51: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 55: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 57: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 61: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 52: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 54: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 56: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 58: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 62: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 16: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 60: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 2: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 53: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 59: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 63: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 21: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 60: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 4: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 10: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 0: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 9: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 19: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 15: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 8: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 11: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 17: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 5: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 33: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 12: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 60: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 14: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 18: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 26: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 13: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 23: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 27: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 29: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 1: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 22: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 30: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 24: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 28: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 20: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 40: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 6: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 32: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 36: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 37: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 52: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 56: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 60: START 2068237: Fri Nov 25 16:59:59 EET 2022 25: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 31: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 34: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 44: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 46: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 49: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 39: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 38: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 52: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 52: START 2068237: Fri Nov 25 16:59:59 EET 2022 56: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 56: START 2068237: Fri Nov 25 16:59:59 EET 2022 35: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 47: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 41: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 50: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 42: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 48: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 45: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 51: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 51: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 2: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 3: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 2: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 51: START 2068237: Fri Nov 25 16:59:59 EET 2022 2: START 2068237: Fri Nov 25 16:59:59 EET 2022 54: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 54: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 16: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 16: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 54: START 2068237: Fri Nov 25 16:59:59 EET 2022 16: START 2068237: Fri Nov 25 16:59:59 EET 2022 43: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 62: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 62: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 62: START 2068237: Fri Nov 25 16:59:59 EET 2022 58: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 58: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 58: START 2068237: Fri Nov 25 16:59:59 EET 2022 57: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 57: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 61: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 57: START 2068237: Fri Nov 25 16:59:59 EET 2022 61: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 7: Model parameters: d_model 2944 ffw_size 11776 kv_size 128 n_heads 23 n_layers 36 61: START 2068237: Fri Nov 25 16:59:59 EET 2022 55: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 55: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 55: START 2068237: Fri Nov 25 16:59:59 EET 2022 4: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 4: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 53: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 4: START 2068237: Fri Nov 25 16:59:59 EET 2022 53: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 59: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 59: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 53: START 2068237: Fri Nov 25 16:59:59 EET 2022 59: START 2068237: Fri Nov 25 16:59:59 EET 2022 0: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 0: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 60: 60: 60: ======================= ROCm System Management Interface ======================= 60: ================================= Concise Info ================================= 60: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 60: 0 49.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 60: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 60: 2 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 60: 3 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 60: 4 49.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 60: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 60: 6 43.0c 79.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 60: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 60: ================================================================================ 60: ============================= End of ROCm SMI Log ============================== 0: START 2068237: Fri Nov 25 16:59:59 EET 2022 10: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 10: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 10: START 2068237: Fri Nov 25 16:59:59 EET 2022 63: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 63: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 21: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 63: START 2068237: Fri Nov 25 16:59:59 EET 2022 21: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 9: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 21: START 2068237: Fri Nov 25 16:59:59 EET 2022 9: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 9: START 2068237: Fri Nov 25 16:59:59 EET 2022 12: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 12: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 12: START 2068237: Fri Nov 25 16:59:59 EET 2022 5: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 5: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 5: START 2068237: Fri Nov 25 16:59:59 EET 2022 17: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 17: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 17: START 2068237: Fri Nov 25 16:59:59 EET 2022 33: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 33: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 33: START 2068237: Fri Nov 25 16:59:59 EET 2022 15: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 15: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 15: START 2068237: Fri Nov 25 16:59:59 EET 2022 19: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 19: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 19: START 2068237: Fri Nov 25 16:59:59 EET 2022 8: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 8: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 8: START 2068237: Fri Nov 25 16:59:59 EET 2022 11: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 11: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 11: START 2068237: Fri Nov 25 16:59:59 EET 2022 1: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 1: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 1: START 2068237: Fri Nov 25 16:59:59 EET 2022 30: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 30: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 14: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 14: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 30: START 2068237: Fri Nov 25 16:59:59 EET 2022 14: START 2068237: Fri Nov 25 16:59:59 EET 2022 18: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 18: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 18: START 2068237: Fri Nov 25 16:59:59 EET 2022 28: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 28: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 28: START 2068237: Fri Nov 25 16:59:59 EET 2022 13: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 13: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 13: START 2068237: Fri Nov 25 16:59:59 EET 2022 40: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 40: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 40: START 2068237: Fri Nov 25 16:59:59 EET 2022 27: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 27: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 29: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 29: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 27: START 2068237: Fri Nov 25 16:59:59 EET 2022 29: START 2068237: Fri Nov 25 16:59:59 EET 2022 20: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 20: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 20: START 2068237: Fri Nov 25 16:59:59 EET 2022 24: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 24: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 24: START 2068237: Fri Nov 25 16:59:59 EET 2022 6: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 6: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 6: START 2068237: Fri Nov 25 16:59:59 EET 2022 23: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 23: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 23: START 2068237: Fri Nov 25 16:59:59 EET 2022 26: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 26: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 26: START 2068237: Fri Nov 25 16:59:59 EET 2022 32: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 32: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 32: START 2068237: Fri Nov 25 16:59:59 EET 2022 36: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 36: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 56: 56: 56: ======================= ROCm System Management Interface ======================= 56: ================================= Concise Info ================================= 56: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 56: 0 41.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 56: 1 53.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 56: 2 39.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 56: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 56: 4 40.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 56: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 56: 6 42.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 56: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 56: ================================================================================ 56: ============================= End of ROCm SMI Log ============================== 36: START 2068237: Fri Nov 25 16:59:59 EET 2022 22: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 22: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 22: START 2068237: Fri Nov 25 16:59:59 EET 2022 37: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 37: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 37: START 2068237: Fri Nov 25 16:59:59 EET 2022 31: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 31: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 31: START 2068237: Fri Nov 25 16:59:59 EET 2022 38: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 38: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 38: START 2068237: Fri Nov 25 16:59:59 EET 2022 39: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 39: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 39: START 2068237: Fri Nov 25 16:59:59 EET 2022 25: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 25: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 34: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 25: START 2068237: Fri Nov 25 16:59:59 EET 2022 34: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 34: START 2068237: Fri Nov 25 16:59:59 EET 2022 46: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 46: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 46: START 2068237: Fri Nov 25 16:59:59 EET 2022 49: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 49: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 44: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 44: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 49: START 2068237: Fri Nov 25 16:59:59 EET 2022 44: START 2068237: Fri Nov 25 16:59:59 EET 2022 35: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 35: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 35: START 2068237: Fri Nov 25 16:59:59 EET 2022 47: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 47: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 41: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 47: START 2068237: Fri Nov 25 16:59:59 EET 2022 41: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 41: START 2068237: Fri Nov 25 16:59:59 EET 2022 50: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 50: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 50: START 2068237: Fri Nov 25 16:59:59 EET 2022 42: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 42: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 42: START 2068237: Fri Nov 25 16:59:59 EET 2022 48: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 48: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 48: START 2068237: Fri Nov 25 16:59:59 EET 2022 45: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 45: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 45: START 2068237: Fri Nov 25 16:59:59 EET 2022 3: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 3: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 51: 51: 51: ======================= ROCm System Management Interface ======================= 51: ================================= Concise Info ================================= 51: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 51: 0 43.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 51: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 51: 2 44.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 51: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 51: 4 42.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 51: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 51: 6 42.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 51: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 51: ================================================================================ 51: ============================= End of ROCm SMI Log ============================== 2: 2: 2: ======================= ROCm System Management Interface ======================= 2: ================================= Concise Info ================================= 2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 2: 0 50.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 2: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 2: 2 42.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 2: 3 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 2: 4 42.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 2: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 2: 6 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 2: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 2: ================================================================================ 2: ============================= End of ROCm SMI Log ============================== 3: START 2068237: Fri Nov 25 16:59:59 EET 2022 43: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 43: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 54: 54: 54: ======================= ROCm System Management Interface ======================= 54: ================================= Concise Info ================================= 54: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 54: 0 39.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 54: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 54: 2 40.0c 80.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 54: 3 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 54: 4 40.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 54: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 54: 6 43.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 54: 7 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 54: ================================================================================ 54: ============================= End of ROCm SMI Log ============================== 43: START 2068237: Fri Nov 25 16:59:59 EET 2022 7: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 36 --hidden-size 2944 --num-attention-heads 23 --kv-channels 128 --ffn-hidden-size 11776 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 1 --global-batch-size 512 --train-samples 12_505_484 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-3b9 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 12_505_484 --lr-warmup-samples 125_055 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_3b9 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_3b9 --load checkpoints_3b9 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data 7: -impl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2068237.json --zero-stage 0 7: START 2068237: Fri Nov 25 16:59:59 EET 2022 16: 16: 16: ======================= ROCm System Management Interface ======================= 16: ================================= Concise Info ================================= 16: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 16: 0 43.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 16: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 16: 2 37.0c 80.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 16: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 16: 4 42.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 16: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 16: 6 42.0c 78.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 16: 7 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 16: ================================================================================ 16: ============================= End of ROCm SMI Log ============================== 62: 62: 62: ======================= ROCm System Management Interface ======================= 62: ================================= Concise Info ================================= 62: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 62: 0 50.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 62: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 62: 2 39.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 62: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 62: 4 42.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 62: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 62: 6 41.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 62: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 62: ================================================================================ 62: ============================= End of ROCm SMI Log ============================== 58: 58: 58: ======================= ROCm System Management Interface ======================= 58: ================================= Concise Info ================================= 58: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 58: 0 40.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 58: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 58: 2 47.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 58: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 58: 4 40.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 58: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 58: 6 42.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 58: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 58: ================================================================================ 58: ============================= End of ROCm SMI Log ============================== 57: 57: 57: ======================= ROCm System Management Interface ======================= 57: ================================= Concise Info ================================= 57: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 57: 0 48.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 57: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 57: 2 36.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 57: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 57: 4 36.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 57: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 57: 6 36.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 57: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 57: ================================================================================ 57: ============================= End of ROCm SMI Log ============================== 61: 61: 61: ======================= ROCm System Management Interface ======================= 61: ================================= Concise Info ================================= 61: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 61: 0 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 61: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 61: 2 39.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 61: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 61: 4 44.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 61: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 61: 6 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 61: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 61: ================================================================================ 61: ============================= End of ROCm SMI Log ============================== 55: 55: 55: ======================= ROCm System Management Interface ======================= 55: ================================= Concise Info ================================= 55: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 55: 0 45.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 55: 1 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 55: 2 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 55: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 55: 4 46.0c 78.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 55: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 55: 6 41.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 55: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 55: ================================================================================ 55: ============================= End of ROCm SMI Log ============================== 4: 4: 4: ======================= ROCm System Management Interface ======================= 4: ================================= Concise Info ================================= 4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 4: 0 43.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 4: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 4: 2 41.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 4: 3 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 4: 4 38.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 4: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 4: 6 40.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 4: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 4: ================================================================================ 4: ============================= End of ROCm SMI Log ============================== 53: 53: 53: ======================= ROCm System Management Interface ======================= 53: ================================= Concise Info ================================= 53: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 53: 0 45.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 53: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 53: 2 42.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 53: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 53: 4 40.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 53: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 53: 6 46.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 53: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 53: ================================================================================ 53: ============================= End of ROCm SMI Log ============================== 59: 59: 59: ======================= ROCm System Management Interface ======================= 59: ================================= Concise Info ================================= 59: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 59: 0 39.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 59: 1 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 59: 2 41.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 59: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 59: 4 44.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 59: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 59: 6 42.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 59: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 59: ================================================================================ 59: ============================= End of ROCm SMI Log ============================== 0: 0: 0: ======================= ROCm System Management Interface ======================= 0: ================================= Concise Info ================================= 0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 0: 0 46.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 0: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 0: 2 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 0: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 0: 4 44.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 0: 5 38.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 0: 6 39.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 0: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 0: ================================================================================ 0: ============================= End of ROCm SMI Log ============================== 63: 63: 63: ======================= ROCm System Management Interface ======================= 63: ================================= Concise Info ================================= 63: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 63: 0 42.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 63: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 63: 2 43.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 63: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 63: 4 42.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 63: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 63: 6 39.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 63: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 63: ================================================================================ 63: ============================= End of ROCm SMI Log ============================== 10: 10: 10: ======================= ROCm System Management Interface ======================= 10: ================================= Concise Info ================================= 10: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 10: 0 43.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 10: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 10: 2 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 10: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 10: 4 36.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 10: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 10: 6 35.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 10: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 10: ================================================================================ 10: ============================= End of ROCm SMI Log ============================== 21: 21: 21: ======================= ROCm System Management Interface ======================= 21: ================================= Concise Info ================================= 21: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 21: 0 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 21: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 21: 2 42.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 21: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 21: 4 41.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 21: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 21: 6 38.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 21: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 21: ================================================================================ 21: ============================= End of ROCm SMI Log ============================== 9: 9: 9: ======================= ROCm System Management Interface ======================= 9: ================================= Concise Info ================================= 9: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 9: 0 46.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 9: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 9: 2 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 9: 3 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 9: 4 44.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 9: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 9: 6 44.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 9: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 9: ================================================================================ 9: ============================= End of ROCm SMI Log ============================== 12: 12: 12: ======================= ROCm System Management Interface ======================= 12: ================================= Concise Info ================================= 12: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 12: 0 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 12: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 12: 2 46.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 12: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 12: 4 45.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 12: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 12: 6 41.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 12: 7 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 12: ================================================================================ 12: ============================= End of ROCm SMI Log ============================== 5: 5: 5: ======================= ROCm System Management Interface ======================= 5: ================================= Concise Info ================================= 5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 5: 0 50.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 5: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 5: 2 47.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 5: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 5: 4 41.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 5: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 5: 6 41.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 5: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 5: ================================================================================ 5: ============================= End of ROCm SMI Log ============================== 17: 17: 17: ======================= ROCm System Management Interface ======================= 17: ================================= Concise Info ================================= 17: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 17: 0 47.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 17: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 17: 2 41.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 17: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 17: 4 44.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 17: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 17: 6 39.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 17: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 17: ================================================================================ 17: ============================= End of ROCm SMI Log ============================== 33: 33: 33: ======================= ROCm System Management Interface ======================= 33: ================================= Concise Info ================================= 33: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 33: 0 36.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 33: 1 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 33: 2 41.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 33: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 33: 4 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 33: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 33: 6 40.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 33: 7 38.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 33: ================================================================================ 33: ============================= End of ROCm SMI Log ============================== 19: 19: 19: ======================= ROCm System Management Interface ======================= 19: ================================= Concise Info ================================= 19: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 19: 0 45.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 19: 1 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 19: 2 38.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 19: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 19: 4 42.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 19: 5 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 19: 6 43.0c 80.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 19: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 19: ================================================================================ 19: ============================= End of ROCm SMI Log ============================== 15: 15: 15: ======================= ROCm System Management Interface ======================= 15: ================================= Concise Info ================================= 15: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 15: 0 46.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 15: 1 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 15: 2 41.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 15: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 15: 4 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 15: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 15: 6 43.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 15: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 15: ================================================================================ 15: ============================= End of ROCm SMI Log ============================== 8: 8: 8: ======================= ROCm System Management Interface ======================= 8: ================================= Concise Info ================================= 8: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 8: 0 43.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 8: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 8: 2 45.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 8: 3 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 8: 4 42.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 8: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 8: 6 41.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 8: 7 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 8: ================================================================================ 8: ============================= End of ROCm SMI Log ============================== 11: 11: 11: ======================= ROCm System Management Interface ======================= 11: ================================= Concise Info ================================= 11: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 11: 0 41.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 11: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 11: 2 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 11: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 11: 4 45.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 11: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 11: 6 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 11: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 11: ================================================================================ 11: ============================= End of ROCm SMI Log ============================== 1: 1: 1: ======================= ROCm System Management Interface ======================= 1: ================================= Concise Info ================================= 1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 1: 0 46.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 1: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 1: 2 45.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 1: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 1: 4 47.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 1: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 1: 6 48.0c 78.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 1: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 1: ================================================================================ 1: ============================= End of ROCm SMI Log ============================== 30: 30: 30: ======================= ROCm System Management Interface ======================= 30: ================================= Concise Info ================================= 30: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 30: 0 43.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 30: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 30: 2 44.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 30: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 30: 4 41.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 30: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 30: 6 37.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 30: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 30: ================================================================================ 30: ============================= End of ROCm SMI Log ============================== 14: 14: 14: ======================= ROCm System Management Interface ======================= 14: ================================= Concise Info ================================= 14: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 14: 0 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 14: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 14: 2 46.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 14: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 14: 4 45.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 14: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 14: 6 51.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 14: 7 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 14: ================================================================================ 14: ============================= End of ROCm SMI Log ============================== 13: 13: 13: ======================= ROCm System Management Interface ======================= 13: ================================= Concise Info ================================= 13: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 13: 0 46.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 13: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 13: 2 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 13: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 13: 4 47.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 13: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 13: 6 44.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 13: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 13: ================================================================================ 13: ============================= End of ROCm SMI Log ============================== 28: 28: 28: ======================= ROCm System Management Interface ======================= 28: ================================= Concise Info ================================= 28: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 28: 0 46.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 28: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 28: 2 45.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 28: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 28: 4 44.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 28: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 28: 6 45.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 28: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 28: ================================================================================ 28: ============================= End of ROCm SMI Log ============================== 18: 18: 18: ======================= ROCm System Management Interface ======================= 18: ================================= Concise Info ================================= 18: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 18: 0 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 18: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 18: 2 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 18: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 18: 4 42.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 18: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 18: 6 40.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 18: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 18: ================================================================================ 18: ============================= End of ROCm SMI Log ============================== 27: 27: 27: ======================= ROCm System Management Interface ======================= 27: ================================= Concise Info ================================= 27: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 27: 0 45.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 27: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 27: 2 40.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 27: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 27: 4 40.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 27: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 27: 6 42.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 27: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 27: ================================================================================ 27: ============================= End of ROCm SMI Log ============================== 20: 20: 20: ======================= ROCm System Management Interface ======================= 20: ================================= Concise Info ================================= 20: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 20: 0 36.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 20: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 20: 2 37.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 20: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 20: 4 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 20: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 20: 6 39.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 20: 7 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 20: ================================================================================ 20: ============================= End of ROCm SMI Log ============================== 29: 29: 29: ======================= ROCm System Management Interface ======================= 29: ================================= Concise Info ================================= 29: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 29: 0 44.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 29: 1 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 29: 2 38.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 29: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 29: 4 41.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 29: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 29: 6 37.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 29: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 29: ================================================================================ 29: ============================= End of ROCm SMI Log ============================== 24: 24: 24: ======================= ROCm System Management Interface ======================= 24: ================================= Concise Info ================================= 24: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 24: 0 47.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 24: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 24: 2 38.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 24: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 24: 4 44.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 24: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 24: 6 41.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 24: 7 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 24: ================================================================================ 24: ============================= End of ROCm SMI Log ============================== 6: 6: 6: ======================= ROCm System Management Interface ======================= 6: ================================= Concise Info ================================= 6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 6: 0 45.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 6: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 6: 2 42.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 6: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 6: 4 43.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 6: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 6: 6 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 6: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 6: ================================================================================ 6: ============================= End of ROCm SMI Log ============================== 23: 23: 23: ======================= ROCm System Management Interface ======================= 23: ================================= Concise Info ================================= 23: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 23: 0 42.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 23: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 23: 2 39.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 23: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 23: 4 40.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 23: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 23: 6 43.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 23: 7 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 23: ================================================================================ 23: ============================= End of ROCm SMI Log ============================== 26: 26: 26: ======================= ROCm System Management Interface ======================= 26: ================================= Concise Info ================================= 26: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 26: 0 46.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 26: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 26: 2 38.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 26: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 26: 4 44.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 26: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 26: 6 41.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 26: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 26: ================================================================================ 26: ============================= End of ROCm SMI Log ============================== 32: 32: 32: ======================= ROCm System Management Interface ======================= 32: ================================= Concise Info ================================= 32: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 32: 0 44.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 32: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 32: 2 40.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 32: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 32: 4 49.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 32: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 32: 6 40.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 32: 7 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 32: ================================================================================ 32: ============================= End of ROCm SMI Log ============================== 22: 22: 22: ======================= ROCm System Management Interface ======================= 22: ================================= Concise Info ================================= 22: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 22: 0 42.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 22: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 22: 2 41.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 22: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 22: 4 42.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 22: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 22: 6 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 22: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 22: ================================================================================ 22: ============================= End of ROCm SMI Log ============================== 36: 36: 36: ======================= ROCm System Management Interface ======================= 36: ================================= Concise Info ================================= 36: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 36: 0 45.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 36: 1 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 36: 2 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 36: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 36: 4 39.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 36: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 36: 6 44.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 36: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 36: ================================================================================ 36: ============================= End of ROCm SMI Log ============================== 52: 52: 52: ======================= ROCm System Management Interface ======================= 52: ================================= Concise Info ================================= 52: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 52: 0 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 52: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 52: 2 42.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 52: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 52: 4 46.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 52: 5 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 52: 6 46.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 52: 7 38.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 52: ================================================================================ 52: ============================= End of ROCm SMI Log ============================== 37: 37: 37: ======================= ROCm System Management Interface ======================= 37: ================================= Concise Info ================================= 37: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 37: 0 47.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 37: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 37: 2 42.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 37: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 37: 4 43.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 37: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 37: 6 44.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 37: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 37: ================================================================================ 37: ============================= End of ROCm SMI Log ============================== 31: 31: 31: ======================= ROCm System Management Interface ======================= 31: ================================= Concise Info ================================= 31: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 31: 0 49.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 31: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 31: 2 38.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 31: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 31: 4 40.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 31: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 31: 6 40.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 31: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 31: ================================================================================ 31: ============================= End of ROCm SMI Log ============================== 38: 38: 38: ======================= ROCm System Management Interface ======================= 38: ================================= Concise Info ================================= 38: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 38: 0 45.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 38: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 38: 2 39.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 38: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 38: 4 44.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 38: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 38: 6 36.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 38: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 38: ================================================================================ 38: ============================= End of ROCm SMI Log ============================== 39: 39: 39: ======================= ROCm System Management Interface ======================= 39: ================================= Concise Info ================================= 39: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 39: 0 46.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 39: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 39: 2 36.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 39: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 39: 4 44.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 39: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 39: 6 41.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 39: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 39: ================================================================================ 39: ============================= End of ROCm SMI Log ============================== 25: 25: 25: ======================= ROCm System Management Interface ======================= 25: ================================= Concise Info ================================= 25: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 25: 0 44.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 25: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 25: 2 44.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 25: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 25: 4 43.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 25: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 25: 6 43.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 25: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 25: ================================================================================ 25: ============================= End of ROCm SMI Log ============================== 34: 34: 34: ======================= ROCm System Management Interface ======================= 34: ================================= Concise Info ================================= 34: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 34: 0 48.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 34: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 34: 2 39.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 34: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 34: 4 42.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 34: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 34: 6 42.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 34: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 34: ================================================================================ 34: ============================= End of ROCm SMI Log ============================== 46: 46: 46: ======================= ROCm System Management Interface ======================= 46: ================================= Concise Info ================================= 46: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 46: 0 44.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 46: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 46: 2 41.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 46: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 46: 4 40.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 46: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 46: 6 45.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 46: 7 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 46: ================================================================================ 46: ============================= End of ROCm SMI Log ============================== 44: 44: 44: ======================= ROCm System Management Interface ======================= 44: ================================= Concise Info ================================= 44: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 44: 0 51.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 44: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 44: 2 42.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 44: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 44: 4 44.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 44: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 44: 6 44.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 44: 7 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 44: ================================================================================ 44: ============================= End of ROCm SMI Log ============================== 49: 49: 49: ======================= ROCm System Management Interface ======================= 49: ================================= Concise Info ================================= 49: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 49: 0 44.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 49: 1 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 49: 2 48.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 49: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 49: 4 44.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 49: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 49: 6 44.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 49: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 49: ================================================================================ 49: ============================= End of ROCm SMI Log ============================== 47: 47: 47: ======================= ROCm System Management Interface ======================= 47: ================================= Concise Info ================================= 47: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 47: 0 45.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 47: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 47: 2 40.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 47: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 47: 4 42.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 47: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 47: 6 40.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 47: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 47: ================================================================================ 47: ============================= End of ROCm SMI Log ============================== 35: 35: 35: ======================= ROCm System Management Interface ======================= 35: ================================= Concise Info ================================= 35: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 35: 0 45.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 35: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 35: 2 47.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 35: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 35: 4 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 35: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 35: 6 36.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 35: 7 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 35: ================================================================================ 35: ============================= End of ROCm SMI Log ============================== 41: 41: 41: ======================= ROCm System Management Interface ======================= 41: ================================= Concise Info ================================= 41: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 41: 0 52.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 41: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 41: 2 43.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 41: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 41: 4 40.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 41: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 41: 6 47.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 41: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 41: ================================================================================ 41: ============================= End of ROCm SMI Log ============================== 50: 50: 50: ======================= ROCm System Management Interface ======================= 50: ================================= Concise Info ================================= 50: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 50: 0 44.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 50: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 50: 2 39.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 50: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 50: 4 40.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 50: 5 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 50: 6 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 50: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 50: ================================================================================ 50: ============================= End of ROCm SMI Log ============================== 42: 42: 42: ======================= ROCm System Management Interface ======================= 42: ================================= Concise Info ================================= 42: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 42: 0 44.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 42: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 42: 2 42.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 42: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 42: 4 43.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 42: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 42: 6 38.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 42: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 42: ================================================================================ 42: ============================= End of ROCm SMI Log ============================== 48: 48: 48: ======================= ROCm System Management Interface ======================= 48: ================================= Concise Info ================================= 48: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 48: 0 43.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 48: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 48: 2 39.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 48: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 48: 4 42.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 48: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 48: 6 40.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 48: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 48: ================================================================================ 48: ============================= End of ROCm SMI Log ============================== 45: 45: 45: ======================= ROCm System Management Interface ======================= 45: ================================= Concise Info ================================= 45: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 45: 0 42.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 45: 1 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 45: 2 40.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 45: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 45: 4 40.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 45: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 45: 6 42.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 45: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 45: ================================================================================ 45: ============================= End of ROCm SMI Log ============================== 3: 3: 3: ======================= ROCm System Management Interface ======================= 3: ================================= Concise Info ================================= 3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 3: 0 43.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 3: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 3: 2 39.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 3: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 3: 4 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 3: 5 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 3: 6 44.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 3: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 3: ================================================================================ 3: ============================= End of ROCm SMI Log ============================== 43: 43: 43: ======================= ROCm System Management Interface ======================= 43: ================================= Concise Info ================================= 43: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 43: 0 43.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 43: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 43: 2 44.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 43: 3 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 43: 4 41.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 43: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 43: 6 42.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 43: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 43: ================================================================================ 43: ============================= End of ROCm SMI Log ============================== 7: 7: 7: ======================= ROCm System Management Interface ======================= 7: ================================= Concise Info ================================= 7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 7: 0 41.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 7: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 7: 2 40.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 7: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 7: 4 38.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 7: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 7: 6 37.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 7: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 7: ================================================================================ 7: ============================= End of ROCm SMI Log ============================== 40: 40: 40: ======================= ROCm System Management Interface ======================= 40: ================================= Concise Info ================================= 40: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 40: 0 44.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 40: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 40: 2 39.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 40: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 40: 4 41.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 40: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 40: 6 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 40: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 40: ================================================================================ 40: ============================= End of ROCm SMI Log ============================== 51: Launching on nid006191 (51/64), master nid005462 port 9999, GPUs 8, CUDA: True 62: Launching on nid006237 (62/64), master nid005462 port 9999, GPUs 8, CUDA: True 16: Launching on nid006078 (16/64), master nid005462 port 9999, GPUs 8, CUDA: True 56: Launching on nid006220 (56/64), master nid005462 port 9999, GPUs 8, CUDA: True 61: Launching on nid006236 (61/64), master nid005462 port 9999, GPUs 8, CUDA: True 57: Launching on nid006221 (57/64), master nid005462 port 9999, GPUs 8, CUDA: True 54: Launching on nid006194 (54/64), master nid005462 port 9999, GPUs 8, CUDA: True 4: Launching on nid005506 (4/64), master nid005462 port 9999, GPUs 8, CUDA: True 13: Launching on nid006013 (13/64), master nid005462 port 9999, GPUs 8, CUDA: True 10: Launching on nid006010 (10/64), master nid005462 port 9999, GPUs 8, CUDA: True 9: Launching on nid006009 (9/64), master nid005462 port 9999, GPUs 8, CUDA: True 53: Launching on nid006193 (53/64), master nid005462 port 9999, GPUs 8, CUDA: True 63: Launching on nid006238 (63/64), master nid005462 port 9999, GPUs 8, CUDA: True 21: Launching on nid006088 (21/64), master nid005462 port 9999, GPUs 8, CUDA: True 12: Launching on nid006012 (12/64), master nid005462 port 9999, GPUs 8, CUDA: True 0: Launching on nid005462 (0/64), master nid005462 port 9999, GPUs 8, CUDA: True 17: Launching on nid006079 (17/64), master nid005462 port 9999, GPUs 8, CUDA: True 14: Launching on nid006076 (14/64), master nid005462 port 9999, GPUs 8, CUDA: True 5: Launching on nid005507 (5/64), master nid005462 port 9999, GPUs 8, CUDA: True 27: Launching on nid006094 (27/64), master nid005462 port 9999, GPUs 8, CUDA: True 19: Launching on nid006081 (19/64), master nid005462 port 9999, GPUs 8, CUDA: True 2: Launching on nid005464 (2/64), master nid005462 port 9999, GPUs 8, CUDA: True 15: Launching on nid006077 (15/64), master nid005462 port 9999, GPUs 8, CUDA: True 23: Launching on nid006090 (23/64), master nid005462 port 9999, GPUs 8, CUDA: True 42: Launching on nid006176 (42/64), master nid005462 port 9999, GPUs 8, CUDA: True 60: Launching on nid006235 (60/64), master nid005462 port 9999, GPUs 8, CUDA: True 58: Launching on nid006222 (58/64), master nid005462 port 9999, GPUs 8, CUDA: True 37: Launching on nid006115 (37/64), master nid005462 port 9999, GPUs 8, CUDA: True 26: Launching on nid006093 (26/64), master nid005462 port 9999, GPUs 8, CUDA: True 36: Launching on nid006114 (36/64), master nid005462 port 9999, GPUs 8, CUDA: True 22: Launching on nid006089 (22/64), master nid005462 port 9999, GPUs 8, CUDA: True 32: Launching on nid006109 (32/64), master nid005462 port 9999, GPUs 8, CUDA: True 39: Launching on nid006117 (39/64), master nid005462 port 9999, GPUs 8, CUDA: True 35: Launching on nid006112 (35/64), master nid005462 port 9999, GPUs 8, CUDA: True 1: Launching on nid005463 (1/64), master nid005462 port 9999, GPUs 8, CUDA: True 34: Launching on nid006111 (34/64), master nid005462 port 9999, GPUs 8, CUDA: True 44: Launching on nid006178 (44/64), master nid005462 port 9999, GPUs 8, CUDA: True 11: Launching on nid006011 (11/64), master nid005462 port 9999, GPUs 8, CUDA: True 31: Launching on nid006108 (31/64), master nid005462 port 9999, GPUs 8, CUDA: True 25: Launching on nid006092 (25/64), master nid005462 port 9999, GPUs 8, CUDA: True 48: Launching on nid006188 (48/64), master nid005462 port 9999, GPUs 8, CUDA: True 41: Launching on nid006175 (41/64), master nid005462 port 9999, GPUs 8, CUDA: True 6: Launching on nid005782 (6/64), master nid005462 port 9999, GPUs 8, CUDA: True 43: Launching on nid006177 (43/64), master nid005462 port 9999, GPUs 8, CUDA: True 50: Launching on nid006190 (50/64), master nid005462 port 9999, GPUs 8, CUDA: True 29: Launching on nid006106 (29/64), master nid005462 port 9999, GPUs 8, CUDA: True 28: Launching on nid006105 (28/64), master nid005462 port 9999, GPUs 8, CUDA: True 24: Launching on nid006091 (24/64), master nid005462 port 9999, GPUs 8, CUDA: True 52: Launching on nid006192 (52/64), master nid005462 port 9999, GPUs 8, CUDA: True 45: Launching on nid006179 (45/64), master nid005462 port 9999, GPUs 8, CUDA: True 38: Launching on nid006116 (38/64), master nid005462 port 9999, GPUs 8, CUDA: True 33: Launching on nid006110 (33/64), master nid005462 port 9999, GPUs 8, CUDA: True 30: Launching on nid006107 (30/64), master nid005462 port 9999, GPUs 8, CUDA: True 7: Launching on nid005783 (7/64), master nid005462 port 9999, GPUs 8, CUDA: True 49: Launching on nid006189 (49/64), master nid005462 port 9999, GPUs 8, CUDA: True 8: Launching on nid006008 (8/64), master nid005462 port 9999, GPUs 8, CUDA: True 59: Launching on nid006223 (59/64), master nid005462 port 9999, GPUs 8, CUDA: True 3: Launching on nid005505 (3/64), master nid005462 port 9999, GPUs 8, CUDA: True 40: Launching on nid006174 (40/64), master nid005462 port 9999, GPUs 8, CUDA: True 46: Launching on nid006180 (46/64), master nid005462 port 9999, GPUs 8, CUDA: True 18: Launching on nid006080 (18/64), master nid005462 port 9999, GPUs 8, CUDA: True 55: Launching on nid006219 (55/64), master nid005462 port 9999, GPUs 8, CUDA: True 20: Launching on nid006082 (20/64), master nid005462 port 9999, GPUs 8, CUDA: True 47: Launching on nid006181 (47/64), master nid005462 port 9999, GPUs 8, CUDA: True 0: using world size: 512, data-parallel-size: 256, tensor-model-parallel size: 1, pipeline-model-parallel size: 2 0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. 0: using torch.bfloat16 for parameters ... 0: ------------------------ arguments ------------------------ 0: abort_on_unmet_fused_kernel_constraints ......... False 0: accumulate_allreduce_grads_in_fp32 .............. True 0: adam_beta1 ...................................... 0.9 0: adam_beta2 ...................................... 0.999 0: adam_eps ........................................ 1e-08 0: adlr_autoresume ................................. False 0: adlr_autoresume_interval ........................ 1000 0: apply_query_key_layer_scaling ................... True 0: apply_residual_connection_post_layernorm ........ False 0: attention_dropout ............................... 0.1 0: attention_softmax_in_fp32 ....................... False 0: bert_binary_head ................................ True 0: bert_load ....................................... None 0: bf16 ............................................ True 0: bias_dropout_fusion ............................. True 0: bias_gelu_fusion ................................ True 0: biencoder_projection_dim ........................ 0 0: biencoder_shared_query_context_model ............ False 0: block_data_path ................................. None 0: checkpoint_activations .......................... False 0: checkpoint_in_cpu ............................... False 0: checkpoint_num_layers ........................... 1 0: clip_grad ....................................... 1.0 0: codecarbon_dir .................................. None 0: consumed_train_samples .......................... 0 0: consumed_train_tokens ........................... 0 0: consumed_valid_samples .......................... 0 0: contigious_checkpointing ........................ False 0: cpu_optimizer ................................... False 0: cpu_torch_adam .................................. False 0: curriculum_learning ............................. False 0: data_impl ....................................... mmap 0: data_parallel_size .............................. 256 0: data_path ....................................... ['/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document'] 0: dataloader_type ................................. single 0: DDP_impl ........................................ local 0: decoder_seq_length .............................. None 0: deepscale ....................................... False 0: deepscale_config ................................ None 0: deepspeed ....................................... True 0: deepspeed_activation_checkpointing .............. False 0: deepspeed_config ................................ ds_configs/2068237.json 0: deepspeed_mpi ................................... False 0: distribute_checkpointed_activations ............. False 0: distributed_backend ............................. nccl 0: embed_layernorm ................................. False 0: embedding_path .................................. None 0: encoder_seq_length .............................. 2048 0: eod_mask_loss ................................... False 0: eval_interval ................................... 1000 0: eval_iters ...................................... 1 0: eval_only ....................................... None 0: evidence_data_path .............................. None 0: exit_duration_in_mins ........................... None 0: exit_interval ................................... None 0: ffn_hidden_size ................................. 11776 0: finetune ........................................ False 0: fp16 ............................................ False 0: fp16_lm_cross_entropy ........................... False 0: fp32_residual_connection ........................ False 0: gigaflos_no_embeds .............................. 0 0: global_batch_size ............................... 512 0: glu_activation .................................. None 0: hidden_dropout .................................. 0.1 0: hidden_size ..................................... 2944 0: hysteresis ...................................... 2 0: ict_head_size ................................... None 0: ict_load ........................................ None 0: img_dim ......................................... 224 0: indexer_batch_size .............................. 128 0: indexer_log_interval ............................ 1000 0: inference ....................................... False 0: init_method_std ................................. 0.02 0: init_method_xavier_uniform ...................... False 0: initial_loss_scale .............................. 4294967296 0: kill_switch_path ................................ kill-switch-3b9 0: kv_channels ..................................... 128 0: layer_norm_fusion ............................... True 0: layernorm_epsilon ............................... 1e-05 0: lazy_mpu_init ................................... None 0: load ............................................ checkpoints_3b9 0: local_rank ...................................... None 0: log_batch_size_to_tensorboard ................... True 0: log_interval .................................... 10 0: log_learning_rate_to_tensorboard ................ True 0: log_level ....................................... None 0: log_level_replica ............................... None 0: log_loss_scale_to_tensorboard ................... True 0: log_num_zeros_in_grad ........................... False 0: log_params_norm ................................. False 0: log_path ........................................ None 0: log_timers_to_tensorboard ....................... True 0: log_validation_ppl_to_tensorboard ............... True 0: loss_on_targets_only ............................ False 0: loss_scale ...................................... None 0: loss_scale_window ............................... 1000 0: lr .............................................. 0.0002 0: lr_decay_iters .................................. None 0: lr_decay_samples ................................ 12505484 0: lr_decay_style .................................. cosine 0: lr_decay_tokens ................................. None 0: lr_warmup_fraction .............................. None 0: lr_warmup_iters ................................. 0 0: lr_warmup_samples ............................... 125055 0: make_vocab_size_divisible_by .................... 128 0: mask_prob ....................................... 0.15 0: masked_softmax_fusion ........................... True 0: max_position_embeddings ......................... 2048 0: mean_noise_span_length .......................... None 0: memory_centric_tiled_linear ..................... False 0: merge_file ...................................... gpt2/merges.txt 0: micro_batch_size ................................ 1 0: min_loss_scale .................................. 1.0 0: min_lr .......................................... 2e-05 0: mmap_warmup ..................................... False 0: no_load_optim ................................... None 0: no_load_rng ..................................... None 0: no_save_optim ................................... None 0: no_save_rng ..................................... None 0: noise_density ................................... None 0: num_attention_heads ............................. 23 0: num_channels .................................... 3 0: num_classes ..................................... 1000 0: num_layers ...................................... 36 0: num_layers_per_virtual_pipeline_stage ........... None 0: num_workers ..................................... 2 0: onnx_safe ....................................... None 0: openai_gelu ..................................... False 0: optimizer ....................................... adam 0: optimizer_fusion ................................ True 0: override_lr_scheduler ........................... False 0: pad_vocab_size_to ............................... None 0: params_dtype .................................... torch.bfloat16 0: partition_activations ........................... False 0: patch_dim ....................................... 16 0: pipeline_model_parallel_size .................... 2 0: position_embedding_type ......................... PositionEmbeddingType.absolute 0: pp_partition_method ............................. None 0: profile_backward ................................ False 0: query_in_block_prob ............................. 0.1 0: rampup_batch_size ............................... None 0: rank ............................................ 0 0: remote_device ................................... none 0: reset_attention_mask ............................ False 0: reset_position_ids .............................. False 0: retriever_report_topk_accuracies ................ [] 0: retriever_score_scaling ......................... False 0: retriever_seq_length ............................ 256 0: reweight_loss_based_on_position_frequency ....... False 0: sample_rate ..................................... 1.0 0: save ............................................ checkpoints_3b9 0: save_interval ................................... 1000 0: scatter_gather_tensors_in_pipeline .............. True 0: scattered_embeddings ............................ False 0: seed ............................................ 1234 0: seq_length ...................................... 2048 0: sgd_momentum .................................... 0.9 0: short_seq_prob .................................. 0.1 0: skip_train_iteration_range ...................... None 0: split ........................................... 949,50,1 0: split_transformers .............................. False 0: sync_tp_duplicated_parameters ................... False 0: synchronize_each_layer .......................... False 0: tensor_model_parallel_size ...................... 1 0: tensorboard_dir ................................. tensorboard_3b9 0: tensorboard_log_interval ........................ 1 0: tensorboard_queue_size .......................... 5 0: test_weighted_split_names ....................... None 0: test_weighted_split_paths ....................... None 0: test_weighted_split_paths_path .................. None 0: test_weighted_split_splits ...................... None 0: test_weighted_split_weights ..................... None 0: tile_factor ..................................... 1 0: titles_data_path ................................ None 0: tokenizer_name_or_path .......................... None 0: tokenizer_type .................................. GPT2BPETokenizer 0: train_iters ..................................... None 0: train_samples ................................... 12505484 0: train_tokens .................................... None 0: train_weighted_split_paths ...................... None 0: train_weighted_split_paths_path ................. None 0: universal_checkpoint ............................ False 0: use_bnb_optimizer ............................... False 0: use_checkpoint_lr_scheduler ..................... False 0: use_contiguous_buffers_in_ddp ................... True 0: use_cpu_initialization .......................... None 0: use_one_sent_docs ............................... False 0: use_pin_memory .................................. False 0: valid_num_workers ............................... 2 0: valid_weighted_split_names ...................... None 0: valid_weighted_split_paths ...................... None 0: valid_weighted_split_paths_path ................. None 0: valid_weighted_split_splits ..................... None 0: valid_weighted_split_weights .................... None 0: virtual_pipeline_model_parallel_size ............ None 0: vocab_extra_ids ................................. 0 0: vocab_file ...................................... gpt2/vocab.json 0: weight_decay .................................... 0.1 0: world_size ...................................... 512 0: zero_allgather_bucket_size ...................... 0.0 0: zero_contigious_gradients ....................... False 0: zero_reduce_bucket_size ......................... 0.0 0: zero_reduce_scatter ............................. False 0: zero_stage ...................................... 0 0: -------------------- end of arguments --------------------- 0: setting number of micro-batches to constant 2 0: > building GPT2BPETokenizer tokenizer ... 0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) 0: DeepSpeed general environment info: 0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] 0: torch version .................... 1.13.0+rocm5.2 0: torch cuda version ............... None 0: torch hip version ................ 5.2.21151-afdc89f8 0: nvcc version ..................... None 0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] 0: deepspeed info ................... 0.7.5, unknown, unknown 0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** 0: > initializing torch distributed ... 0: [2022-11-25 17:00:09,210] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl 63: > setting tensorboard ... 0: > initializing tensor model parallel with size 1 0: > initializing pipeline model parallel with size 2 0: > setting random seeds to 1234 ... 0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 0: > compiling dataset index builder ... 0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' 0: make: Nothing to be done for 'default'. 0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' 0: >>> done with dataset index builder. Compilation time: 0.060 seconds 0: WARNING: constraints for invoking optimized fused softmax kernel are not met. We default back to unfused kernel invocations. 0: > compiling and loading fused kernels ... 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] 0: Total number of unsupported CUDA function calls: 0 0: 0: 0: Total number of replaced kernel launches: 87 0: [1/1] c++ scaled_upper_triang_masked_softmax_hip.o scaled_upper_triang_masked_softmax_hip.cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_upper_triang_masked_softmax_cuda.so 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] 0: Total number of unsupported CUDA function calls: 0 0: 0: 0: Total number of replaced kernel launches: 63 0: ninja: no work to do. 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] 0: Total number of unsupported CUDA function calls: 0 0: 0: 0: Total number of replaced kernel launches: 67 0: [1/1] c++ layer_norm_hip_kernel.cuda.o layer_norm_cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o fused_mix_prec_layer_norm_cuda.so 0: >>> done with compiling and loading fused kernels. Compilation time: 20.227 seconds 0: time to initialize megatron (seconds): 37.670 0: [after megatron is initialized] datetime: 2022-11-25 17:00:53 0: building GPT model ... 0: [2022-11-25 17:00:53,722] [INFO] [utils.py:827:see_memory_usage] Before Building Model 0: [2022-11-25 17:00:53,723] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB 0: [2022-11-25 17:00:53,723] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.99 GB, percent = 7.1% 0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None 0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi 0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4 0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63, ProcessCoord(pipe=0, data=64, model=0): 64, ProcessCoord(pipe=0, data=65, model=0): 65, ProcessCoord(pipe=0, data=66, model=0): 66, ProcessCoord(pipe=0, data=67, model=0): 67, ProcessCoord(pipe=0, data=68, model=0): 68, ProcessCoord(pipe=0, data=69, model=0): 0: 69, ProcessCoord(pipe=0, data=70, model=0): 70, ProcessCoord(pipe=0, data=71, model=0): 71, ProcessCoord(pipe=0, data=72, model=0): 72, ProcessCoord(pipe=0, data=73, model=0): 73, ProcessCoord(pipe=0, data=74, model=0): 74, ProcessCoord(pipe=0, data=75, model=0): 75, ProcessCoord(pipe=0, data=76, model=0): 76, ProcessCoord(pipe=0, data=77, model=0): 77, ProcessCoord(pipe=0, data=78, model=0): 78, ProcessCoord(pipe=0, data=79, model=0): 79, ProcessCoord(pipe=0, data=80, model=0): 80, ProcessCoord(pipe=0, data=81, model=0): 81, ProcessCoord(pipe=0, data=82, model=0): 82, ProcessCoord(pipe=0, data=83, model=0): 83, ProcessCoord(pipe=0, data=84, model=0): 84, ProcessCoord(pipe=0, data=85, model=0): 85, ProcessCoord(pipe=0, data=86, model=0): 86, ProcessCoord(pipe=0, data=87, model=0): 87, ProcessCoord(pipe=0, data=88, model=0): 88, ProcessCoord(pipe=0, data=89, model=0): 89, ProcessCoord(pipe=0, data=90, model=0): 90, ProcessCoord(pipe=0, data=91, model=0): 91, ProcessCoord(pipe=0, data=92, model=0): 92, Process 0: Coord(pipe=0, data=93, model=0): 93, ProcessCoord(pipe=0, data=94, model=0): 94, ProcessCoord(pipe=0, data=95, model=0): 95, ProcessCoord(pipe=0, data=96, model=0): 96, ProcessCoord(pipe=0, data=97, model=0): 97, ProcessCoord(pipe=0, data=98, model=0): 98, ProcessCoord(pipe=0, data=99, model=0): 99, ProcessCoord(pipe=0, data=100, model=0): 100, ProcessCoord(pipe=0, data=101, model=0): 101, ProcessCoord(pipe=0, data=102, model=0): 102, ProcessCoord(pipe=0, data=103, model=0): 103, ProcessCoord(pipe=0, data=104, model=0): 104, ProcessCoord(pipe=0, data=105, model=0): 105, ProcessCoord(pipe=0, data=106, model=0): 106, ProcessCoord(pipe=0, data=107, model=0): 107, ProcessCoord(pipe=0, data=108, model=0): 108, ProcessCoord(pipe=0, data=109, model=0): 109, ProcessCoord(pipe=0, data=110, model=0): 110, ProcessCoord(pipe=0, data=111, model=0): 111, ProcessCoord(pipe=0, data=112, model=0): 112, ProcessCoord(pipe=0, data=113, model=0): 113, ProcessCoord(pipe=0, data=114, model=0): 114, ProcessCoord(pipe=0, data=115, mo 0: del=0): 115, ProcessCoord(pipe=0, data=116, model=0): 116, ProcessCoord(pipe=0, data=117, model=0): 117, ProcessCoord(pipe=0, data=118, model=0): 118, ProcessCoord(pipe=0, data=119, model=0): 119, ProcessCoord(pipe=0, data=120, model=0): 120, ProcessCoord(pipe=0, data=121, model=0): 121, ProcessCoord(pipe=0, data=122, model=0): 122, ProcessCoord(pipe=0, data=123, model=0): 123, ProcessCoord(pipe=0, data=124, model=0): 124, ProcessCoord(pipe=0, data=125, model=0): 125, ProcessCoord(pipe=0, data=126, model=0): 126, ProcessCoord(pipe=0, data=127, model=0): 127, ProcessCoord(pipe=0, data=128, model=0): 128, ProcessCoord(pipe=0, data=129, model=0): 129, ProcessCoord(pipe=0, data=130, model=0): 130, ProcessCoord(pipe=0, data=131, model=0): 131, ProcessCoord(pipe=0, data=132, model=0): 132, ProcessCoord(pipe=0, data=133, model=0): 133, ProcessCoord(pipe=0, data=134, model=0): 134, ProcessCoord(pipe=0, data=135, model=0): 135, ProcessCoord(pipe=0, data=136, model=0): 136, ProcessCoord(pipe=0, data=137, model=0): 137, 0: ProcessCoord(pipe=0, data=138, model=0): 138, ProcessCoord(pipe=0, data=139, model=0): 139, ProcessCoord(pipe=0, data=140, model=0): 140, ProcessCoord(pipe=0, data=141, model=0): 141, ProcessCoord(pipe=0, data=142, model=0): 142, ProcessCoord(pipe=0, data=143, model=0): 143, ProcessCoord(pipe=0, data=144, model=0): 144, ProcessCoord(pipe=0, data=145, model=0): 145, ProcessCoord(pipe=0, data=146, model=0): 146, ProcessCoord(pipe=0, data=147, model=0): 147, ProcessCoord(pipe=0, data=148, model=0): 148, ProcessCoord(pipe=0, data=149, model=0): 149, ProcessCoord(pipe=0, data=150, model=0): 150, ProcessCoord(pipe=0, data=151, model=0): 151, ProcessCoord(pipe=0, data=152, model=0): 152, ProcessCoord(pipe=0, data=153, model=0): 153, ProcessCoord(pipe=0, data=154, model=0): 154, ProcessCoord(pipe=0, data=155, model=0): 155, ProcessCoord(pipe=0, data=156, model=0): 156, ProcessCoord(pipe=0, data=157, model=0): 157, ProcessCoord(pipe=0, data=158, model=0): 158, ProcessCoord(pipe=0, data=159, model=0): 159, ProcessCoor 0: d(pipe=0, data=160, model=0): 160, ProcessCoord(pipe=0, data=161, model=0): 161, ProcessCoord(pipe=0, data=162, model=0): 162, ProcessCoord(pipe=0, data=163, model=0): 163, ProcessCoord(pipe=0, data=164, model=0): 164, ProcessCoord(pipe=0, data=165, model=0): 165, ProcessCoord(pipe=0, data=166, model=0): 166, ProcessCoord(pipe=0, data=167, model=0): 167, ProcessCoord(pipe=0, data=168, model=0): 168, ProcessCoord(pipe=0, data=169, model=0): 169, ProcessCoord(pipe=0, data=170, model=0): 170, ProcessCoord(pipe=0, data=171, model=0): 171, ProcessCoord(pipe=0, data=172, model=0): 172, ProcessCoord(pipe=0, data=173, model=0): 173, ProcessCoord(pipe=0, data=174, model=0): 174, ProcessCoord(pipe=0, data=175, model=0): 175, ProcessCoord(pipe=0, data=176, model=0): 176, ProcessCoord(pipe=0, data=177, model=0): 177, ProcessCoord(pipe=0, data=178, model=0): 178, ProcessCoord(pipe=0, data=179, model=0): 179, ProcessCoord(pipe=0, data=180, model=0): 180, ProcessCoord(pipe=0, data=181, model=0): 181, ProcessCoord(pipe=0, da 0: ta=182, model=0): 182, ProcessCoord(pipe=0, data=183, model=0): 183, ProcessCoord(pipe=0, data=184, model=0): 184, ProcessCoord(pipe=0, data=185, model=0): 185, ProcessCoord(pipe=0, data=186, model=0): 186, ProcessCoord(pipe=0, data=187, model=0): 187, ProcessCoord(pipe=0, data=188, model=0): 188, ProcessCoord(pipe=0, data=189, model=0): 189, ProcessCoord(pipe=0, data=190, model=0): 190, ProcessCoord(pipe=0, data=191, model=0): 191, ProcessCoord(pipe=0, data=192, model=0): 192, ProcessCoord(pipe=0, data=193, model=0): 193, ProcessCoord(pipe=0, data=194, model=0): 194, ProcessCoord(pipe=0, data=195, model=0): 195, ProcessCoord(pipe=0, data=196, model=0): 196, ProcessCoord(pipe=0, data=197, model=0): 197, ProcessCoord(pipe=0, data=198, model=0): 198, ProcessCoord(pipe=0, data=199, model=0): 199, ProcessCoord(pipe=0, data=200, model=0): 200, ProcessCoord(pipe=0, data=201, model=0): 201, ProcessCoord(pipe=0, data=202, model=0): 202, ProcessCoord(pipe=0, data=203, model=0): 203, ProcessCoord(pipe=0, data=204, mode 0: l=0): 204, ProcessCoord(pipe=0, data=205, model=0): 205, ProcessCoord(pipe=0, data=206, model=0): 206, ProcessCoord(pipe=0, data=207, model=0): 207, ProcessCoord(pipe=0, data=208, model=0): 208, ProcessCoord(pipe=0, data=209, model=0): 209, ProcessCoord(pipe=0, data=210, model=0): 210, ProcessCoord(pipe=0, data=211, model=0): 211, ProcessCoord(pipe=0, data=212, model=0): 212, ProcessCoord(pipe=0, data=213, model=0): 213, ProcessCoord(pipe=0, data=214, model=0): 214, ProcessCoord(pipe=0, data=215, model=0): 215, ProcessCoord(pipe=0, data=216, model=0): 216, ProcessCoord(pipe=0, data=217, model=0): 217, ProcessCoord(pipe=0, data=218, model=0): 218, ProcessCoord(pipe=0, data=219, model=0): 219, ProcessCoord(pipe=0, data=220, model=0): 220, ProcessCoord(pipe=0, data=221, model=0): 221, ProcessCoord(pipe=0, data=222, model=0): 222, ProcessCoord(pipe=0, data=223, model=0): 223, ProcessCoord(pipe=0, data=224, model=0): 224, ProcessCoord(pipe=0, data=225, model=0): 225, ProcessCoord(pipe=0, data=226, model=0): 226, P 0: rocessCoord(pipe=0, data=227, model=0): 227, ProcessCoord(pipe=0, data=228, model=0): 228, ProcessCoord(pipe=0, data=229, model=0): 229, ProcessCoord(pipe=0, data=230, model=0): 230, ProcessCoord(pipe=0, data=231, model=0): 231, ProcessCoord(pipe=0, data=232, model=0): 232, ProcessCoord(pipe=0, data=233, model=0): 233, ProcessCoord(pipe=0, data=234, model=0): 234, ProcessCoord(pipe=0, data=235, model=0): 235, ProcessCoord(pipe=0, data=236, model=0): 236, ProcessCoord(pipe=0, data=237, model=0): 237, ProcessCoord(pipe=0, data=238, model=0): 238, ProcessCoord(pipe=0, data=239, model=0): 239, ProcessCoord(pipe=0, data=240, model=0): 240, ProcessCoord(pipe=0, data=241, model=0): 241, ProcessCoord(pipe=0, data=242, model=0): 242, ProcessCoord(pipe=0, data=243, model=0): 243, ProcessCoord(pipe=0, data=244, model=0): 244, ProcessCoord(pipe=0, data=245, model=0): 245, ProcessCoord(pipe=0, data=246, model=0): 246, ProcessCoord(pipe=0, data=247, model=0): 247, ProcessCoord(pipe=0, data=248, model=0): 248, ProcessCoord( 0: pipe=0, data=249, model=0): 249, ProcessCoord(pipe=0, data=250, model=0): 250, ProcessCoord(pipe=0, data=251, model=0): 251, ProcessCoord(pipe=0, data=252, model=0): 252, ProcessCoord(pipe=0, data=253, model=0): 253, ProcessCoord(pipe=0, data=254, model=0): 254, ProcessCoord(pipe=0, data=255, model=0): 255, ProcessCoord(pipe=1, data=0, model=0): 256, ProcessCoord(pipe=1, data=1, model=0): 257, ProcessCoord(pipe=1, data=2, model=0): 258, ProcessCoord(pipe=1, data=3, model=0): 259, ProcessCoord(pipe=1, data=4, model=0): 260, ProcessCoord(pipe=1, data=5, model=0): 261, ProcessCoord(pipe=1, data=6, model=0): 262, ProcessCoord(pipe=1, data=7, model=0): 263, ProcessCoord(pipe=1, data=8, model=0): 264, ProcessCoord(pipe=1, data=9, model=0): 265, ProcessCoord(pipe=1, data=10, model=0): 266, ProcessCoord(pipe=1, data=11, model=0): 267, ProcessCoord(pipe=1, data=12, model=0): 268, ProcessCoord(pipe=1, data=13, model=0): 269, ProcessCoord(pipe=1, data=14, model=0): 270, ProcessCoord(pipe=1, data=15, model=0): 271, Proce 0: ssCoord(pipe=1, data=16, model=0): 272, ProcessCoord(pipe=1, data=17, model=0): 273, ProcessCoord(pipe=1, data=18, model=0): 274, ProcessCoord(pipe=1, data=19, model=0): 275, ProcessCoord(pipe=1, data=20, model=0): 276, ProcessCoord(pipe=1, data=21, model=0): 277, ProcessCoord(pipe=1, data=22, model=0): 278, ProcessCoord(pipe=1, data=23, model=0): 279, ProcessCoord(pipe=1, data=24, model=0): 280, ProcessCoord(pipe=1, data=25, model=0): 281, ProcessCoord(pipe=1, data=26, model=0): 282, ProcessCoord(pipe=1, data=27, model=0): 283, ProcessCoord(pipe=1, data=28, model=0): 284, ProcessCoord(pipe=1, data=29, model=0): 285, ProcessCoord(pipe=1, data=30, model=0): 286, ProcessCoord(pipe=1, data=31, model=0): 287, ProcessCoord(pipe=1, data=32, model=0): 288, ProcessCoord(pipe=1, data=33, model=0): 289, ProcessCoord(pipe=1, data=34, model=0): 290, ProcessCoord(pipe=1, data=35, model=0): 291, ProcessCoord(pipe=1, data=36, model=0): 292, ProcessCoord(pipe=1, data=37, model=0): 293, ProcessCoord(pipe=1, data=38, model=0): 0: 294, ProcessCoord(pipe=1, data=39, model=0): 295, ProcessCoord(pipe=1, data=40, model=0): 296, ProcessCoord(pipe=1, data=41, model=0): 297, ProcessCoord(pipe=1, data=42, model=0): 298, ProcessCoord(pipe=1, data=43, model=0): 299, ProcessCoord(pipe=1, data=44, model=0): 300, ProcessCoord(pipe=1, data=45, model=0): 301, ProcessCoord(pipe=1, data=46, model=0): 302, ProcessCoord(pipe=1, data=47, model=0): 303, ProcessCoord(pipe=1, data=48, model=0): 304, ProcessCoord(pipe=1, data=49, model=0): 305, ProcessCoord(pipe=1, data=50, model=0): 306, ProcessCoord(pipe=1, data=51, model=0): 307, ProcessCoord(pipe=1, data=52, model=0): 308, ProcessCoord(pipe=1, data=53, model=0): 309, ProcessCoord(pipe=1, data=54, model=0): 310, ProcessCoord(pipe=1, data=55, model=0): 311, ProcessCoord(pipe=1, data=56, model=0): 312, ProcessCoord(pipe=1, data=57, model=0): 313, ProcessCoord(pipe=1, data=58, model=0): 314, ProcessCoord(pipe=1, data=59, model=0): 315, ProcessCoord(pipe=1, data=60, model=0): 316, ProcessCoord(pipe=1, data=61 0: , model=0): 317, ProcessCoord(pipe=1, data=62, model=0): 318, ProcessCoord(pipe=1, data=63, model=0): 319, ProcessCoord(pipe=1, data=64, model=0): 320, ProcessCoord(pipe=1, data=65, model=0): 321, ProcessCoord(pipe=1, data=66, model=0): 322, ProcessCoord(pipe=1, data=67, model=0): 323, ProcessCoord(pipe=1, data=68, model=0): 324, ProcessCoord(pipe=1, data=69, model=0): 325, ProcessCoord(pipe=1, data=70, model=0): 326, ProcessCoord(pipe=1, data=71, model=0): 327, ProcessCoord(pipe=1, data=72, model=0): 328, ProcessCoord(pipe=1, data=73, model=0): 329, ProcessCoord(pipe=1, data=74, model=0): 330, ProcessCoord(pipe=1, data=75, model=0): 331, ProcessCoord(pipe=1, data=76, model=0): 332, ProcessCoord(pipe=1, data=77, model=0): 333, ProcessCoord(pipe=1, data=78, model=0): 334, ProcessCoord(pipe=1, data=79, model=0): 335, ProcessCoord(pipe=1, data=80, model=0): 336, ProcessCoord(pipe=1, data=81, model=0): 337, ProcessCoord(pipe=1, data=82, model=0): 338, ProcessCoord(pipe=1, data=83, model=0): 339, ProcessCoord(pipe 0: =1, data=84, model=0): 340, ProcessCoord(pipe=1, data=85, model=0): 341, ProcessCoord(pipe=1, data=86, model=0): 342, ProcessCoord(pipe=1, data=87, model=0): 343, ProcessCoord(pipe=1, data=88, model=0): 344, ProcessCoord(pipe=1, data=89, model=0): 345, ProcessCoord(pipe=1, data=90, model=0): 346, ProcessCoord(pipe=1, data=91, model=0): 347, ProcessCoord(pipe=1, data=92, model=0): 348, ProcessCoord(pipe=1, data=93, model=0): 349, ProcessCoord(pipe=1, data=94, model=0): 350, ProcessCoord(pipe=1, data=95, model=0): 351, ProcessCoord(pipe=1, data=96, model=0): 352, ProcessCoord(pipe=1, data=97, model=0): 353, ProcessCoord(pipe=1, data=98, model=0): 354, ProcessCoord(pipe=1, data=99, model=0): 355, ProcessCoord(pipe=1, data=100, model=0): 356, ProcessCoord(pipe=1, data=101, model=0): 357, ProcessCoord(pipe=1, data=102, model=0): 358, ProcessCoord(pipe=1, data=103, model=0): 359, ProcessCoord(pipe=1, data=104, model=0): 360, ProcessCoord(pipe=1, data=105, model=0): 361, ProcessCoord(pipe=1, data=106, model=0): 362, 0: ProcessCoord(pipe=1, data=107, model=0): 363, ProcessCoord(pipe=1, data=108, model=0): 364, ProcessCoord(pipe=1, data=109, model=0): 365, ProcessCoord(pipe=1, data=110, model=0): 366, ProcessCoord(pipe=1, data=111, model=0): 367, ProcessCoord(pipe=1, data=112, model=0): 368, ProcessCoord(pipe=1, data=113, model=0): 369, ProcessCoord(pipe=1, data=114, model=0): 370, ProcessCoord(pipe=1, data=115, model=0): 371, ProcessCoord(pipe=1, data=116, model=0): 372, ProcessCoord(pipe=1, data=117, model=0): 373, ProcessCoord(pipe=1, data=118, model=0): 374, ProcessCoord(pipe=1, data=119, model=0): 375, ProcessCoord(pipe=1, data=120, model=0): 376, ProcessCoord(pipe=1, data=121, model=0): 377, ProcessCoord(pipe=1, data=122, model=0): 378, ProcessCoord(pipe=1, data=123, model=0): 379, ProcessCoord(pipe=1, data=124, model=0): 380, ProcessCoord(pipe=1, data=125, model=0): 381, ProcessCoord(pipe=1, data=126, model=0): 382, ProcessCoord(pipe=1, data=127, model=0): 383, ProcessCoord(pipe=1, data=128, model=0): 384, ProcessCoor 0: d(pipe=1, data=129, model=0): 385, ProcessCoord(pipe=1, data=130, model=0): 386, ProcessCoord(pipe=1, data=131, model=0): 387, ProcessCoord(pipe=1, data=132, model=0): 388, ProcessCoord(pipe=1, data=133, model=0): 389, ProcessCoord(pipe=1, data=134, model=0): 390, ProcessCoord(pipe=1, data=135, model=0): 391, ProcessCoord(pipe=1, data=136, model=0): 392, ProcessCoord(pipe=1, data=137, model=0): 393, ProcessCoord(pipe=1, data=138, model=0): 394, ProcessCoord(pipe=1, data=139, model=0): 395, ProcessCoord(pipe=1, data=140, model=0): 396, ProcessCoord(pipe=1, data=141, model=0): 397, ProcessCoord(pipe=1, data=142, model=0): 398, ProcessCoord(pipe=1, data=143, model=0): 399, ProcessCoord(pipe=1, data=144, model=0): 400, ProcessCoord(pipe=1, data=145, model=0): 401, ProcessCoord(pipe=1, data=146, model=0): 402, ProcessCoord(pipe=1, data=147, model=0): 403, ProcessCoord(pipe=1, data=148, model=0): 404, ProcessCoord(pipe=1, data=149, model=0): 405, ProcessCoord(pipe=1, data=150, model=0): 406, ProcessCoord(pipe=1, da 0: ta=151, model=0): 407, ProcessCoord(pipe=1, data=152, model=0): 408, ProcessCoord(pipe=1, data=153, model=0): 409, ProcessCoord(pipe=1, data=154, model=0): 410, ProcessCoord(pipe=1, data=155, model=0): 411, ProcessCoord(pipe=1, data=156, model=0): 412, ProcessCoord(pipe=1, data=157, model=0): 413, ProcessCoord(pipe=1, data=158, model=0): 414, ProcessCoord(pipe=1, data=159, model=0): 415, ProcessCoord(pipe=1, data=160, model=0): 416, ProcessCoord(pipe=1, data=161, model=0): 417, ProcessCoord(pipe=1, data=162, model=0): 418, ProcessCoord(pipe=1, data=163, model=0): 419, ProcessCoord(pipe=1, data=164, model=0): 420, ProcessCoord(pipe=1, data=165, model=0): 421, ProcessCoord(pipe=1, data=166, model=0): 422, ProcessCoord(pipe=1, data=167, model=0): 423, ProcessCoord(pipe=1, data=168, model=0): 424, ProcessCoord(pipe=1, data=169, model=0): 425, ProcessCoord(pipe=1, data=170, model=0): 426, ProcessCoord(pipe=1, data=171, model=0): 427, ProcessCoord(pipe=1, data=172, model=0): 428, ProcessCoord(pipe=1, data=173, mode 0: l=0): 429, ProcessCoord(pipe=1, data=174, model=0): 430, ProcessCoord(pipe=1, data=175, model=0): 431, ProcessCoord(pipe=1, data=176, model=0): 432, ProcessCoord(pipe=1, data=177, model=0): 433, ProcessCoord(pipe=1, data=178, model=0): 434, ProcessCoord(pipe=1, data=179, model=0): 435, ProcessCoord(pipe=1, data=180, model=0): 436, ProcessCoord(pipe=1, data=181, model=0): 437, ProcessCoord(pipe=1, data=182, model=0): 438, ProcessCoord(pipe=1, data=183, model=0): 439, ProcessCoord(pipe=1, data=184, model=0): 440, ProcessCoord(pipe=1, data=185, model=0): 441, ProcessCoord(pipe=1, data=186, model=0): 442, ProcessCoord(pipe=1, data=187, model=0): 443, ProcessCoord(pipe=1, data=188, model=0): 444, ProcessCoord(pipe=1, data=189, model=0): 445, ProcessCoord(pipe=1, data=190, model=0): 446, ProcessCoord(pipe=1, data=191, model=0): 447, ProcessCoord(pipe=1, data=192, model=0): 448, ProcessCoord(pipe=1, data=193, model=0): 449, ProcessCoord(pipe=1, data=194, model=0): 450, ProcessCoord(pipe=1, data=195, model=0): 451, P 0: rocessCoord(pipe=1, data=196, model=0): 452, ProcessCoord(pipe=1, data=197, model=0): 453, ProcessCoord(pipe=1, data=198, model=0): 454, ProcessCoord(pipe=1, data=199, model=0): 455, ProcessCoord(pipe=1, data=200, model=0): 456, ProcessCoord(pipe=1, data=201, model=0): 457, ProcessCoord(pipe=1, data=202, model=0): 458, ProcessCoord(pipe=1, data=203, model=0): 459, ProcessCoord(pipe=1, data=204, model=0): 460, ProcessCoord(pipe=1, data=205, model=0): 461, ProcessCoord(pipe=1, data=206, model=0): 462, ProcessCoord(pipe=1, data=207, model=0): 463, ProcessCoord(pipe=1, data=208, model=0): 464, ProcessCoord(pipe=1, data=209, model=0): 465, ProcessCoord(pipe=1, data=210, model=0): 466, ProcessCoord(pipe=1, data=211, model=0): 467, ProcessCoord(pipe=1, data=212, model=0): 468, ProcessCoord(pipe=1, data=213, model=0): 469, ProcessCoord(pipe=1, data=214, model=0): 470, ProcessCoord(pipe=1, data=215, model=0): 471, ProcessCoord(pipe=1, data=216, model=0): 472, ProcessCoord(pipe=1, data=217, model=0): 473, ProcessCoord( 0: pipe=1, data=218, model=0): 474, ProcessCoord(pipe=1, data=219, model=0): 475, ProcessCoord(pipe=1, data=220, model=0): 476, ProcessCoord(pipe=1, data=221, model=0): 477, ProcessCoord(pipe=1, data=222, model=0): 478, ProcessCoord(pipe=1, data=223, model=0): 479, ProcessCoord(pipe=1, data=224, model=0): 480, ProcessCoord(pipe=1, data=225, model=0): 481, ProcessCoord(pipe=1, data=226, model=0): 482, ProcessCoord(pipe=1, data=227, model=0): 483, ProcessCoord(pipe=1, data=228, model=0): 484, ProcessCoord(pipe=1, data=229, model=0): 485, ProcessCoord(pipe=1, data=230, model=0): 486, ProcessCoord(pipe=1, data=231, model=0): 487, ProcessCoord(pipe=1, data=232, model=0): 488, ProcessCoord(pipe=1, data=233, model=0): 489, ProcessCoord(pipe=1, data=234, model=0): 490, ProcessCoord(pipe=1, data=235, model=0): 491, ProcessCoord(pipe=1, data=236, model=0): 492, ProcessCoord(pipe=1, data=237, model=0): 493, ProcessCoord(pipe=1, data=238, model=0): 494, ProcessCoord(pipe=1, data=239, model=0): 495, ProcessCoord(pipe=1, data 0: =240, model=0): 496, ProcessCoord(pipe=1, data=241, model=0): 497, ProcessCoord(pipe=1, data=242, model=0): 498, ProcessCoord(pipe=1, data=243, model=0): 499, ProcessCoord(pipe=1, data=244, model=0): 500, ProcessCoord(pipe=1, data=245, model=0): 501, ProcessCoord(pipe=1, data=246, model=0): 502, ProcessCoord(pipe=1, data=247, model=0): 503, ProcessCoord(pipe=1, data=248, model=0): 504, ProcessCoord(pipe=1, data=249, model=0): 505, ProcessCoord(pipe=1, data=250, model=0): 506, ProcessCoord(pipe=1, data=251, model=0): 507, ProcessCoord(pipe=1, data=252, model=0): 508, ProcessCoord(pipe=1, data=253, model=0): 509, ProcessCoord(pipe=1, data=254, model=0): 510, ProcessCoord(pipe=1, data=255, model=0): 511} 0: [2022-11-25 17:01:11,749] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer 0: stage=0 layers=21 0: 0: _to_float16 0: 1: EmbeddingPipe 0: 2: 0: 3: ParallelTransformerLayerPipe 0: 4: ParallelTransformerLayerPipe 0: 5: ParallelTransformerLayerPipe 0: 6: ParallelTransformerLayerPipe 0: 7: ParallelTransformerLayerPipe 0: 8: ParallelTransformerLayerPipe 0: 9: ParallelTransformerLayerPipe 0: 10: ParallelTransformerLayerPipe 0: 11: ParallelTransformerLayerPipe 0: 12: ParallelTransformerLayerPipe 0: 13: ParallelTransformerLayerPipe 0: 14: ParallelTransformerLayerPipe 0: 15: ParallelTransformerLayerPipe 0: 16: ParallelTransformerLayerPipe 0: 17: ParallelTransformerLayerPipe 0: 18: ParallelTransformerLayerPipe 0: 19: ParallelTransformerLayerPipe 0: 20: ParallelTransformerLayerPipe 0: stage=1 layers=22 0: 21: ParallelTransformerLayerPipe 0: 22: ParallelTransformerLayerPipe 0: 23: ParallelTransformerLayerPipe 0: 24: ParallelTransformerLayerPipe 0: 25: ParallelTransformerLayerPipe 0: 26: ParallelTransformerLayerPipe 0: 27: ParallelTransformerLayerPipe 0: 28: ParallelTransformerLayerPipe 0: 29: ParallelTransformerLayerPipe 0: 30: ParallelTransformerLayerPipe 0: 31: ParallelTransformerLayerPipe 0: 32: ParallelTransformerLayerPipe 0: 33: ParallelTransformerLayerPipe 0: 34: ParallelTransformerLayerPipe 0: 35: ParallelTransformerLayerPipe 0: 36: ParallelTransformerLayerPipe 0: 37: ParallelTransformerLayerPipe 0: 38: ParallelTransformerLayerPipe 0: 39: undo 0: 40: MixedFusedLayerNorm 0: 41: EmbeddingPipe 0: 42: float16_to_fp32 0: loss: CrossEntropy 0: [2022-11-25 17:01:16,749] [INFO] [utils.py:827:see_memory_usage] After Building Model 0: [2022-11-25 17:01:16,749] [INFO] [utils.py:828:see_memory_usage] MA 3.8 GB Max_MA 3.8 GB CA 3.89 GB Max_CA 4 GB 0: [2022-11-25 17:01:16,750] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 36.52 GB, percent = 7.3% 0: setting training iterations to 24424 0: > learning rate decay style: cosine 0: DeepSpeed is enabled. 0: [2022-11-25 17:01:16,751] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown 0: [2022-11-25 17:01:35,356] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False 0: [2022-11-25 17:01:35,357] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer 0: [2022-11-25 17:01:35,357] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer 0: [2022-11-25 17:01:35,363] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam 0: [2022-11-25 17:01:35,363] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer 0: [2022-11-25 17:01:35,402] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer 0: [2022-11-25 17:01:35,402] [INFO] [utils.py:828:see_memory_usage] MA 3.78 GB Max_MA 3.81 GB CA 3.9 GB Max_CA 4 GB 0: [2022-11-25 17:01:35,402] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.36 GB, percent = 7.4% 4: ninja: no work to do. 4: Time to load utils op: 0.17975997924804688 seconds 0: Time to load utils op: 0.10273575782775879 seconds 0: [2022-11-25 17:01:35,543] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 0: [2022-11-25 17:01:35,544] [INFO] [utils.py:828:see_memory_usage] MA 3.78 GB Max_MA 3.78 GB CA 3.9 GB Max_CA 4 GB 0: [2022-11-25 17:01:35,544] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.36 GB, percent = 7.4% 1: ninja: no work to do. 1: Time to load utils op: 0.13853740692138672 seconds 29: Time to load utils op: 0.10260963439941406 seconds 29: Time to load utils op: 0.10258817672729492 seconds 29: Time to load utils op: 0.10318231582641602 seconds 29: Time to load utils op: 0.1027674674987793 seconds 29: Time to load utils op: 0.10284066200256348 seconds 29: Time to load utils op: 0.10290884971618652 seconds 29: Time to load utils op: 0.10288262367248535 seconds 29: Time to load utils op: 0.10300612449645996 seconds 30: Time to load utils op: 0.1045675277709961 seconds 30: Time to load utils op: 0.10468173027038574 seconds 31: Time to load utils op: 0.10437822341918945 seconds 30: Time to load utils op: 0.10469460487365723 seconds 30: Time to load utils op: 0.10454463958740234 seconds 30: Time to load utils op: 0.10474562644958496 seconds 31: Time to load utils op: 0.10475635528564453 secondsTime to load utils op: 0.1037905216217041 seconds 31: 30: Time to load utils op: 0.10484719276428223 seconds 30: Time to load utils op: 0.10502481460571289 seconds 31: Time to load utils op: 0.10446310043334961 secondsTime to load utils op: 0.10486531257629395 seconds 31: 31: Time to load utils op: 0.10388994216918945 seconds 30: Time to load utils op: 0.10538220405578613 seconds 31: Time to load utils op: 0.10452699661254883 seconds 31: Time to load utils op: 0.1052083969116211 seconds 0: [2022-11-25 17:01:35,710] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 0: [2022-11-25 17:01:35,710] [INFO] [utils.py:828:see_memory_usage] MA 7.73 GB Max_MA 7.73 GB CA 9.8 GB Max_CA 10 GB 0: [2022-11-25 17:01:35,710] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.36 GB, percent = 7.4% 33: Time to load utils op: 0.10236334800720215 seconds 35: Time to load utils op: 0.10284829139709473 seconds 37: Time to load utils op: 0.10270333290100098 seconds 1: Time to load utils op: 0.20344185829162598 seconds 1: Time to load utils op: 0.20246410369873047 seconds 1: Time to load utils op: 0.2033369541168213 seconds 1: Time to load utils op: 0.20322108268737793 seconds 1: Time to load utils op: 0.20314359664916992 seconds 1: Time to load utils op: 0.20330238342285156 secondsTime to load utils op: 0.20324134826660156 seconds 1: 0: Time to load utils op: 0.20435142517089844 seconds 0: Time to load utils op: 0.2045738697052002 seconds 0: Time to load utils op: 0.204848051071167 seconds 0: Time to load utils op: 0.20492196083068848 seconds 0: Time to load utils op: 0.20502328872680664 seconds 0: Time to load utils op: 0.2050919532775879 seconds 0: Time to load utils op: 0.20536327362060547 seconds 1: Time to load utils op: 0.0006062984466552734 seconds 2: Time to load utils op: 0.20374751091003418 seconds 2: Time to load utils op: 0.20398259162902832 seconds 2: Time to load utils op: 0.20364785194396973 secondsTime to load utils op: 0.20366477966308594 seconds 2: 2: Time to load utils op: 0.20388364791870117 seconds 2: Time to load utils op: 0.20430898666381836 seconds 3: Time to load utils op: 0.20325136184692383 secondsTime to load utils op: 0.2026689052581787 seconds 3: 2: Time to load utils op: 0.20392322540283203 seconds 2: Time to load utils op: 0.20400142669677734 seconds 3: Time to load utils op: 0.20263075828552246 seconds 3: Time to load utils op: 0.2027890682220459 seconds 3: Time to load utils op: 0.2028369903564453 seconds 3: Time to load utils op: 0.2029256820678711 seconds 3: Time to load utils op: 0.20322442054748535 seconds 4: Time to load utils op: 0.0006208419799804688 seconds 3: Time to load utils op: 0.20327973365783691 seconds 4: Time to load utils op: 0.20388460159301758 seconds 4: Time to load utils op: 0.20370841026306152 seconds 43: Time to load utils op: 0.10307741165161133 seconds 4: Time to load utils op: 0.20466160774230957 seconds 4: Time to load utils op: 0.2035987377166748 seconds 4: Time to load utils op: 0.20354366302490234 seconds 4: Time to load utils op: 0.20368337631225586 secondsTime to load utils op: 0.20464038848876953 seconds 4: 6: Time to load utils op: 0.20387840270996094 seconds 6: Time to load utils op: 0.2032625675201416 secondsTime to load utils op: 0.20342350006103516 seconds 6: 6: Time to load utils op: 0.20329499244689941 seconds 6: Time to load utils op: 0.20350003242492676 seconds 6: Time to load utils op: 0.2036428451538086 seconds 6: Time to load utils op: 0.2039508819580078 seconds 6: Time to load utils op: 0.20361900329589844 seconds 10: Time to load utils op: 0.4036369323730469 seconds 5: Time to load utils op: 0.2045285701751709 seconds 9: Time to load utils op: 0.4036374092102051 seconds 7: Time to load utils op: 0.20291662216186523 seconds 7: Time to load utils op: 0.20279955863952637 seconds 5: Time to load utils op: 0.2051868438720703 seconds 7: Time to load utils op: 0.20378971099853516 secondsTime to load utils op: 0.20350146293640137 seconds 7: 7: Time to load utils op: 0.2029261589050293 seconds 5: Time to load utils op: 0.20497655868530273 seconds 5: Time to load utils op: 0.2051858901977539 secondsTime to load utils op: 0.20503973960876465 seconds 5: 7: Time to load utils op: 0.2032778263092041 seconds 7: Time to load utils op: 0.2037801742553711 seconds 5: Time to load utils op: 0.2051231861114502 seconds 7: Time to load utils op: 0.2039351463317871 seconds 5: Time to load utils op: 0.20521831512451172 seconds 5: Time to load utils op: 0.2053670883178711 seconds 8: Time to load utils op: 0.20374798774719238 seconds 8: Time to load utils op: 0.20384430885314941 secondsTime to load utils op: 0.20264506340026855 seconds 8: 8: Time to load utils op: 0.20276308059692383 seconds 8: Time to load utils op: 0.20272445678710938 seconds 8: Time to load utils op: 0.20410466194152832 seconds 49: Time to load utils op: 0.10254454612731934 seconds 8: Time to load utils op: 0.2031559944152832 seconds 9: Time to load utils op: 0.2024974822998047 seconds 9: Time to load utils op: 0.20258331298828125 seconds 9: Time to load utils op: 0.2024378776550293 seconds 9: Time to load utils op: 0.20276212692260742 seconds 8: Time to load utils op: 0.203416109085083 seconds 9: Time to load utils op: 0.203033447265625 seconds 9: Time to load utils op: 0.2025165557861328 seconds 9: Time to load utils op: 0.2031245231628418 seconds 10: Time to load utils op: 0.20247268676757812 seconds 10: Time to load utils op: 0.20215439796447754 seconds 10: Time to load utils op: 0.20246338844299316 seconds 10: Time to load utils op: 0.20200324058532715 seconds 10: Time to load utils op: 0.20284295082092285 seconds 10: Time to load utils op: 0.2022247314453125 seconds 13: Time to load utils op: 0.4035358428955078 seconds 10: Time to load utils op: 0.20246362686157227 seconds 11: Time to load utils op: 0.2037670612335205 seconds 11: Time to load utils op: 0.20283722877502441 seconds 11: Time to load utils op: 0.20317673683166504 secondsTime to load utils op: 0.203094482421875 seconds 11: 11: Time to load utils op: 0.2042250633239746 seconds 12: Time to load utils op: 0.20322370529174805 seconds 1: Time to load utils op: 0.0004458427429199219 seconds 1: Time to load utils op: 0.00039839744567871094 seconds 1: Time to load utils op: 0.0003247261047363281 seconds 1: Time to load utils op: 0.00036978721618652344 seconds 1: Time to load utils op: 0.00033736228942871094 seconds 1: Time to load utils op: 0.0003781318664550781 seconds 1: Time to load utils op: 0.00030732154846191406 seconds 0: Time to load utils op: 0.0005710124969482422 seconds 0: Time to load utils op: 0.0007627010345458984 seconds 0: Time to load utils op: 0.0007278919219970703 seconds 0: Time to load utils op: 0.0005631446838378906 seconds 0: Time to load utils op: 0.0005962848663330078 seconds 0: Time to load utils op: 0.0004956722259521484 seconds 0: Time to load utils op: 0.0006501674652099609 seconds 2: Time to load utils op: 0.0004673004150390625 seconds 4: Time to load utils op: 0.0004811286926269531 seconds 2: Time to load utils op: 0.0005598068237304688 seconds 4: Time to load utils op: 0.00038743019104003906 seconds 4: Time to load utils op: 0.0006356239318847656 seconds 3: Time to load utils op: 0.0007107257843017578 seconds 4: Time to load utils op: 0.00045561790466308594 seconds 2: Time to load utils op: 0.0006392002105712891 seconds 3: Time to load utils op: 0.0003333091735839844 seconds 2: Time to load utils op: 0.0005831718444824219 seconds 3: Time to load utils op: 0.0006577968597412109 seconds 4: Time to load utils op: 0.0004930496215820312 seconds 2: Time to load utils op: 0.0005106925964355469 seconds 3: Time to load utils op: 0.0005087852478027344 seconds 2: Time to load utils op: 0.0004863739013671875 seconds 3: Time to load utils op: 0.00044345855712890625 seconds 2: Time to load utils op: 0.0004565715789794922 seconds 4: Time to load utils op: 0.00035119056701660156 seconds 4: Time to load utils op: 0.0004115104675292969 seconds 2: Time to load utils op: 0.0003342628479003906 seconds 3: Time to load utils op: 0.0003905296325683594 seconds 3: Time to load utils op: 0.0003299713134765625 seconds 3: Time to load utils op: 0.0003724098205566406 seconds 9: Time to load utils op: 0.0006527900695800781 seconds 10: Time to load utils op: 0.0005860328674316406 seconds 6: Time to load utils op: 0.0004978179931640625 seconds 0: [2022-11-25 17:01:35,755] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 6: Time to load utils op: 0.0007791519165039062 seconds 0: [2022-11-25 17:01:35,756] [INFO] [utils.py:828:see_memory_usage] MA 7.73 GB Max_MA 7.73 GB CA 9.8 GB Max_CA 10 GB 6: Time to load utils op: 0.0007681846618652344 seconds 0: [2022-11-25 17:01:35,756] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.38 GB, percent = 7.4% 6: Time to load utils op: 0.0005586147308349609 seconds 7: Time to load utils op: 0.0006604194641113281 seconds 6: Time to load utils op: 0.0005986690521240234 seconds 7: Time to load utils op: 0.0007598400115966797 seconds 6: Time to load utils op: 0.0007493495941162109 seconds 9: Time to load utils op: 0.0004696846008300781 seconds 6: Time to load utils op: 0.0004658699035644531 seconds 7: Time to load utils op: 0.0006151199340820312 seconds 9: Time to load utils op: 0.0004703998565673828 seconds 9: Time to load utils op: 0.00047087669372558594 seconds 6: Time to load utils op: 0.00048732757568359375 seconds 7: Time to load utils op: 0.0005028247833251953 seconds 8: Time to load utils op: 0.0005578994750976562 seconds 10: Time to load utils op: 0.0005059242248535156 seconds 9: Time to load utils op: 0.00041103363037109375 seconds 5: Time to load utils op: 0.0005137920379638672 seconds 10: Time to load utils op: 0.00039076805114746094 seconds 13: Time to load utils op: 0.0004248619079589844 seconds 7: Time to load utils op: 0.0003871917724609375 seconds 7: Time to load utils op: 0.0003821849822998047 seconds 9: Time to load utils op: 0.0004456043243408203 seconds 9: Time to load utils op: 0.0005192756652832031 seconds 7: Time to load utils op: 0.0005488395690917969 seconds 5: Time to load utils op: 0.0005209445953369141 seconds 9: Time to load utils op: 0.0003905296325683594 seconds 5: Time to load utils op: 0.0003559589385986328 seconds 8: Time to load utils op: 0.0006492137908935547 seconds 8: Time to load utils op: 0.0006198883056640625 seconds 5: Time to load utils op: 0.0005488395690917969 seconds 10: Time to load utils op: 0.0005555152893066406 seconds 7: Time to load utils op: 0.0007975101470947266 seconds 10: Time to load utils op: 0.0003275871276855469 seconds 5: Time to load utils op: 0.0005862712860107422 seconds 5: Time to load utils op: 0.0006701946258544922 seconds 10: Time to load utils op: 0.0003838539123535156 seconds 5: Time to load utils op: 0.0004620552062988281 seconds 8: Time to load utils op: 0.0006511211395263672 seconds 5: Time to load utils op: 0.00038933753967285156 seconds 10: Time to load utils op: 0.0005207061767578125 seconds 8: Time to load utils op: 0.0004639625549316406 secondsTime to load utils op: 0.00036835670471191406 seconds 8: 10: Time to load utils op: 0.0004837512969970703 seconds 8: Time to load utils op: 0.0004923343658447266 seconds 8: Time to load utils op: 0.0008285045623779297 seconds 11: Time to load utils op: 0.000537872314453125 secondsTime to load utils op: 0.0006005764007568359 seconds 11: 12: Time to load utils op: 0.000640869140625 seconds 11: Time to load utils op: 0.0006906986236572266 seconds 11: Time to load utils op: 0.0005266666412353516 seconds 11: Time to load utils op: 0.0005662441253662109 seconds 29: Time to load utils op: 0.0005352497100830078 seconds 29: Time to load utils op: 0.0004756450653076172 seconds 29: Time to load utils op: 0.00031566619873046875 seconds 29: Time to load utils op: 0.00034499168395996094 seconds 30: Time to load utils op: 0.0004956722259521484 secondsTime to load utils op: 0.0003592967987060547 seconds 30: 29: Time to load utils op: 0.00037980079650878906 seconds 29: Time to load utils op: 0.00035691261291503906 seconds 29: Time to load utils op: 0.0003573894500732422 seconds 30: Time to load utils op: 0.0004010200500488281 seconds 30: Time to load utils op: 0.00043845176696777344 seconds 29: Time to load utils op: 0.0003833770751953125 seconds 30: Time to load utils op: 0.0003147125244140625 seconds 30: Time to load utils op: 0.00048089027404785156 seconds 30: Time to load utils op: 0.0003504753112792969 seconds 31: Time to load utils op: 0.00046062469482421875 seconds 31: Time to load utils op: 0.0004215240478515625 seconds 30: Time to load utils op: 0.00035309791564941406 seconds 31: Time to load utils op: 0.00038361549377441406 seconds 31: Time to load utils op: 0.000347137451171875 seconds 31: Time to load utils op: 0.00036144256591796875 seconds 31: Time to load utils op: 0.0003275871276855469 seconds 31: Time to load utils op: 0.0002989768981933594 seconds 31: Time to load utils op: 0.000396728515625 seconds 0: [2022-11-25 17:01:35,801] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 0: [2022-11-25 17:01:35,801] [INFO] [utils.py:828:see_memory_usage] MA 11.36 GB Max_MA 11.36 GB CA 15.23 GB Max_CA 15 GB 0: [2022-11-25 17:01:35,801] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.39 GB, percent = 7.4% 0: [2022-11-25 17:01:35,833] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 0: [2022-11-25 17:01:35,833] [INFO] [utils.py:828:see_memory_usage] MA 11.36 GB Max_MA 11.36 GB CA 15.23 GB Max_CA 15 GB 0: [2022-11-25 17:01:35,834] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.39 GB, percent = 7.4% 0: [2022-11-25 17:01:35,868] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 0: [2022-11-25 17:01:35,869] [INFO] [utils.py:828:see_memory_usage] MA 11.36 GB Max_MA 11.36 GB CA 15.23 GB Max_CA 15 GB 0: [2022-11-25 17:01:35,869] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.39 GB, percent = 7.4% 0: [2022-11-25 17:01:35,899] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer 0: [2022-11-25 17:01:35,899] [INFO] [utils.py:828:see_memory_usage] MA 11.36 GB Max_MA 11.36 GB CA 15.23 GB Max_CA 15 GB 0: [2022-11-25 17:01:35,899] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.39 GB, percent = 7.4% 32: ninja: no work to do. 33: Time to load utils op: 0.0005762577056884766 seconds 32: Time to load utils op: 0.17734766006469727 seconds 35: Time to load utils op: 0.0004899501800537109 seconds 37: Time to load utils op: 0.00047397613525390625 seconds 43: Time to load utils op: 0.0004813671112060547 seconds 0: [2022-11-25 17:01:35,935] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer 0: [2022-11-25 17:01:35,936] [INFO] [utils.py:828:see_memory_usage] MA 11.42 GB Max_MA 11.42 GB CA 15.23 GB Max_CA 15 GB 0: [2022-11-25 17:01:35,936] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.39 GB, percent = 7.4% 49: Time to load utils op: 0.00045418739318847656 seconds 32: Time to load utils op: 0.0005650520324707031 seconds 11: Time to load utils op: 0.40433239936828613 seconds 32: Time to load utils op: 0.20301270484924316 seconds 32: Time to load utils op: 0.20285677909851074 seconds 11: Time to load utils op: 0.4046025276184082 seconds 32: Time to load utils op: 0.20299506187438965 seconds 11: Time to load utils op: 0.4048001766204834 seconds 32: Time to load utils op: 0.20287537574768066 seconds 32: Time to load utils op: 0.20307326316833496 seconds 32: Time to load utils op: 0.20302128791809082 seconds 12: Time to load utils op: 0.40345311164855957 seconds 12: Time to load utils op: 0.40369534492492676 seconds 13: Time to load utils op: 0.403275728225708 secondsTime to load utils op: 0.4036252498626709 seconds 13: 13: Time to load utils op: 0.403226375579834 seconds 12: Time to load utils op: 0.40470433235168457 seconds 13: Time to load utils op: 0.40291261672973633 seconds 12: Time to load utils op: 0.40488433837890625 seconds 13: Time to load utils op: 0.4028751850128174 seconds 13: Time to load utils op: 0.40265369415283203 seconds 13: Time to load utils op: 0.4030337333679199 seconds 12: Time to load utils op: 0.40418219566345215 secondsTime to load utils op: 0.40452098846435547 seconds 12: 12: Time to load utils op: 0.40497922897338867 seconds 33: Time to load utils op: 0.2034444808959961 seconds 52: Time to load utils op: 0.3035707473754883 seconds 33: Time to load utils op: 0.20321869850158691 seconds 33: Time to load utils op: 0.20389509201049805 seconds 33: Time to load utils op: 0.20323443412780762 seconds 33: Time to load utils op: 0.20389962196350098 seconds 33: Time to load utils op: 0.20363116264343262 secondsTime to load utils op: 0.20337986946105957 seconds 33: 14: Time to load utils op: 0.40381860733032227 secondsTime to load utils op: 0.4050154685974121 seconds 14: 34: Time to load utils op: 0.20300865173339844 seconds 14: Time to load utils op: 0.40496158599853516 seconds 34: Time to load utils op: 0.20290279388427734 seconds 14: Time to load utils op: 0.4044182300567627 secondsTime to load utils op: 0.40389299392700195 seconds 14: 14: Time to load utils op: 0.4040207862854004 seconds 34: Time to load utils op: 0.20279240608215332 seconds 34: Time to load utils op: 0.20273613929748535 seconds 34: Time to load utils op: 0.2027292251586914 seconds 14: Time to load utils op: 0.40469861030578613 seconds 34: Time to load utils op: 0.2027878761291504 seconds 34: Time to load utils op: 0.20281505584716797 seconds 34: Time to load utils op: 0.2030797004699707 seconds 14: Time to load utils op: 0.405043363571167 seconds 15: Time to load utils op: 0.4045412540435791 seconds 15: Time to load utils op: 0.40363240242004395 seconds 15: Time to load utils op: 0.4040846824645996 seconds 15: Time to load utils op: 0.40389370918273926 secondsTime to load utils op: 0.4042789936065674 seconds 15: 35: Time to load utils op: 0.2031693458557129 secondsTime to load utils op: 0.2031099796295166 seconds 35: 35: Time to load utils op: 0.20319223403930664 seconds 15: Time to load utils op: 0.4044339656829834 seconds 35: Time to load utils op: 0.20315098762512207 secondsTime to load utils op: 0.2030630111694336 secondsTime to load utils op: 0.20308136940002441 seconds 35: 35: 15: Time to load utils op: 0.40412306785583496 seconds 35: Time to load utils op: 0.20308184623718262 seconds 15: Time to load utils op: 0.40435028076171875 seconds 19: Time to load utils op: 0.6041409969329834 seconds 37: Time to load utils op: 0.20259833335876465 seconds 37: Time to load utils op: 0.20261478424072266 seconds 37: Time to load utils op: 0.2036597728729248 seconds 36: Time to load utils op: 0.20391106605529785 seconds 36: Time to load utils op: 0.20305252075195312 seconds 37: Time to load utils op: 0.2032303810119629 seconds 36: Time to load utils op: 0.20309877395629883 seconds 37: Time to load utils op: 0.20342063903808594 secondsTime to load utils op: 0.203413724899292 seconds 37: 37: Time to load utils op: 0.20269346237182617 seconds 36: Time to load utils op: 0.20381927490234375 secondsTime to load utils op: 0.2040266990661621 seconds 36: 17: Time to load utils op: 0.40321826934814453 seconds 36: Time to load utils op: 0.20333552360534668 seconds 17: Time to load utils op: 0.40410566329956055 seconds 36: Time to load utils op: 0.20412015914916992 seconds 36: Time to load utils op: 0.20347881317138672 seconds 17: Time to load utils op: 0.40378904342651367 seconds 17: Time to load utils op: 0.4036428928375244 seconds 17: Time to load utils op: 0.4044961929321289 seconds 17: Time to load utils op: 0.4046778678894043 seconds 17: Time to load utils op: 0.403576135635376 seconds 19: Time to load utils op: 0.40241074562072754 seconds 16: Time to load utils op: 0.404801607131958 seconds 17: Time to load utils op: 0.40414905548095703 seconds 19: Time to load utils op: 0.4024381637573242 seconds 18: Time to load utils op: 0.4040701389312744 secondsTime to load utils op: 0.4034438133239746 seconds 18: 16: Time to load utils op: 0.4048144817352295 seconds 16: Time to load utils op: 0.4054219722747803 seconds 18: Time to load utils op: 0.4035146236419678 secondsTime to load utils op: 0.4032294750213623 seconds 18: 19: Time to load utils op: 0.40260934829711914 seconds 18: Time to load utils op: 0.4030916690826416 seconds 18: Time to load utils op: 0.403714656829834 seconds 18: Time to load utils op: 0.4031546115875244 seconds 16: Time to load utils op: 0.40526700019836426 secondsTime to load utils op: 0.4058246612548828 seconds 16: 16: Time to load utils op: 0.4057636260986328 seconds 16: Time to load utils op: 0.40586137771606445 seconds 18: Time to load utils op: 0.4036571979522705 seconds 19: Time to load utils op: 0.40273332595825195 seconds 19: Time to load utils op: 0.402843713760376 seconds 16: Time to load utils op: 0.40592527389526367 seconds 19: Time to load utils op: 0.40257930755615234 secondsTime to load utils op: 0.4025435447692871 seconds 19: 22: Time to load utils op: 0.6038057804107666 seconds 59: Time to load utils op: 0.3032875061035156 seconds 41: Time to load utils op: 0.20296788215637207 secondsTime to load utils op: 0.20223760604858398 seconds 41: 41: Time to load utils op: 0.20243072509765625 seconds 41: Time to load utils op: 0.2019641399383545 seconds 20: Time to load utils op: 0.4045834541320801 seconds 41: Time to load utils op: 0.20207810401916504 seconds 41: Time to load utils op: 0.2020864486694336 seconds 41: Time to load utils op: 0.20241594314575195 seconds 39: Time to load utils op: 0.20472979545593262 seconds 38: Time to load utils op: 0.20518064498901367 seconds 20: Time to load utils op: 0.40416884422302246 seconds 39: Time to load utils op: 0.20472145080566406 seconds 39: Time to load utils op: 0.20477867126464844 seconds 20: Time to load utils op: 0.40457797050476074 seconds 39: Time to load utils op: 0.20480990409851074 seconds 38: Time to load utils op: 0.20542120933532715 seconds 39: Time to load utils op: 0.20482873916625977 seconds 20: Time to load utils op: 0.40428853034973145 seconds 38: Time to load utils op: 0.20545530319213867 seconds 41: Time to load utils op: 0.20292425155639648 seconds 39: Time to load utils op: 0.2050166130065918 seconds 38: Time to load utils op: 0.20536088943481445 seconds 21: Time to load utils op: 0.403580904006958 seconds 20: Time to load utils op: 0.4049956798553467 seconds 39: Time to load utils op: 0.2049562931060791 seconds 39: Time to load utils op: 0.20496821403503418 seconds 38: Time to load utils op: 0.2057044506072998 seconds 21: Time to load utils op: 0.4035656452178955 secondsTime to load utils op: 0.40317654609680176 seconds 21: 20: Time to load utils op: 0.40468287467956543 seconds 38: Time to load utils op: 0.2056891918182373 seconds 20: Time to load utils op: 0.4051809310913086 seconds 21: Time to load utils op: 0.4042484760284424 seconds 21: Time to load utils op: 0.40425634384155273 seconds 21: Time to load utils op: 0.40396833419799805 seconds 38: Time to load utils op: 0.2060096263885498 seconds 20: Time to load utils op: 0.4047553539276123 seconds 21: Time to load utils op: 0.4037137031555176 seconds 38: Time to load utils op: 0.20602893829345703 seconds 40: Time to load utils op: 0.20410990715026855 seconds 21: Time to load utils op: 0.4046452045440674 seconds 40: Time to load utils op: 0.20373988151550293 seconds 40: Time to load utils op: 0.2039940357208252 seconds 40: Time to load utils op: 0.20353126525878906 seconds 22: Time to load utils op: 0.4026920795440674 secondsTime to load utils op: 0.40259742736816406 seconds 22: 40: Time to load utils op: 0.20360064506530762 seconds 40: Time to load utils op: 0.20404458045959473 seconds 40: Time to load utils op: 0.20418548583984375 seconds 40: Time to load utils op: 0.20454120635986328 seconds 22: Time to load utils op: 0.4022648334503174 seconds 22: Time to load utils op: 0.40230655670166016 seconds 22: Time to load utils op: 0.40258121490478516 seconds 22: Time to load utils op: 0.4026310443878174 seconds 22: Time to load utils op: 0.40270113945007324 seconds 42: Time to load utils op: 0.2032155990600586 seconds 42: Time to load utils op: 0.20252346992492676 seconds 42: Time to load utils op: 0.20259737968444824 seconds 42: Time to load utils op: 0.20269107818603516 seconds 42: Time to load utils op: 0.20328164100646973 seconds 42: Time to load utils op: 0.20270466804504395 seconds 42: Time to load utils op: 0.2028200626373291 seconds 42: Time to load utils op: 0.20296144485473633 seconds 24: Time to load utils op: 0.4035205841064453 seconds 24: Time to load utils op: 0.40362977981567383 seconds 24: Time to load utils op: 0.4040391445159912 secondsTime to load utils op: 0.4036281108856201 seconds 24: 24: Time to load utils op: 0.4034554958343506 seconds 24: Time to load utils op: 0.4043591022491455 seconds 24: Time to load utils op: 0.403775691986084 seconds 43: Time to load utils op: 0.20493459701538086 secondsTime to load utils op: 0.20438027381896973 seconds 43: 23: Time to load utils op: 0.4053471088409424 seconds 43: Time to load utils op: 0.20439958572387695 seconds 43: Time to load utils op: 0.20491266250610352 secondsTime to load utils op: 0.20478343963623047 seconds 43: 23: Time to load utils op: 0.4049959182739258 seconds 43: Time to load utils op: 0.20471549034118652 secondsTime to load utils op: 0.20483922958374023 seconds 43: 23: Time to load utils op: 0.4053068161010742 seconds 23: Time to load utils op: 0.40596508979797363 seconds 24: Time to load utils op: 0.4050600528717041 seconds 23: Time to load utils op: 0.40636539459228516 seconds 23: Time to load utils op: 0.4058663845062256 seconds 25: Time to load utils op: 0.40405726432800293 seconds 23: Time to load utils op: 0.40677905082702637 secondsTime to load utils op: 0.4067716598510742 seconds 23: 25: Time to load utils op: 0.4033026695251465 seconds 25: Time to load utils op: 0.4043464660644531 seconds 25: Time to load utils op: 0.403672456741333 secondsTime to load utils op: 0.40357470512390137 seconds 25: 25: Time to load utils op: 0.40467214584350586 seconds 25: Time to load utils op: 0.40385913848876953 seconds 44: Time to load utils op: 0.20502567291259766 seconds 25: Time to load utils op: 0.404435396194458 seconds 45: Time to load utils op: 0.2025132179260254 secondsTime to load utils op: 0.20251679420471191 secondsTime to load utils op: 0.2026834487915039 seconds 45: 45: 44: Time to load utils op: 0.20493721961975098 seconds 44: Time to load utils op: 0.2058568000793457 seconds 45: Time to load utils op: 0.20281982421875 seconds 44: Time to load utils op: 0.20573186874389648 seconds 44: Time to load utils op: 0.20573949813842773 seconds 44: Time to load utils op: 0.20529985427856445 seconds 45: Time to load utils op: 0.20286035537719727 secondsTime to load utils op: 0.20287656784057617 seconds 45: 45: Time to load utils op: 0.20374655723571777 seconds 32: Time to load utils op: 0.302473783493042 seconds 45: Time to load utils op: 0.20310449600219727 seconds 44: Time to load utils op: 0.2060835361480713 seconds 44: Time to load utils op: 0.2063767910003662 seconds 26: Time to load utils op: 0.403411865234375 seconds 26: Time to load utils op: 0.4040987491607666 seconds 26: Time to load utils op: 0.4048805236816406 secondsTime to load utils op: 0.4049975872039795 secondsTime to load utils op: 0.40428638458251953 seconds 26: 26: 26: Time to load utils op: 0.4036993980407715 seconds 32: Time to load utils op: 0.0003554821014404297 seconds 26: Time to load utils op: 0.40499424934387207 seconds 26: Time to load utils op: 0.40421128273010254 seconds 11: Time to load utils op: 0.0004954338073730469 seconds 11: Time to load utils op: 0.0004673004150390625 seconds 11: Time to load utils op: 0.0004475116729736328 seconds 32: Time to load utils op: 0.00039696693420410156 seconds 46: Time to load utils op: 0.20527887344360352 secondsTime to load utils op: 0.20498037338256836 seconds 46: 46: Time to load utils op: 0.20504212379455566 seconds 52: Time to load utils op: 0.0006935596466064453 seconds 12: Time to load utils op: 0.0005013942718505859 seconds 32: Time to load utils op: 0.0003612041473388672 seconds 46: Time to load utils op: 0.2056140899658203 seconds 12: Time to load utils op: 0.0005452632904052734 seconds 33: Time to load utils op: 0.0003173351287841797 seconds 46: Time to load utils op: 0.20507097244262695 seconds 32: Time to load utils op: 0.0003628730773925781 seconds 13: Time to load utils op: 0.0005052089691162109 seconds 12: Time to load utils op: 0.00037860870361328125 seconds 46: Time to load utils op: 0.2060246467590332 seconds 12: Time to load utils op: 0.00037026405334472656 seconds 13: Time to load utils op: 0.0003769397735595703 seconds 46: Time to load utils op: 0.20547175407409668 seconds 33: Time to load utils op: 0.00037980079650878906 seconds 32: Time to load utils op: 0.0003788471221923828 seconds 46: Time to load utils op: 0.2057340145111084 seconds 49: Time to load utils op: 0.20294499397277832 seconds 27: Time to load utils op: 0.4048454761505127 seconds 28: Time to load utils op: 0.40442943572998047 secondsTime to load utils op: 0.4043419361114502 seconds 28: 13: Time to load utils op: 0.0003409385681152344 seconds 33: Time to load utils op: 0.00037860870361328125 seconds 33: Time to load utils op: 0.00037384033203125 seconds 32: Time to load utils op: 0.0003600120544433594 seconds 49: Time to load utils op: 0.20327353477478027 seconds 27: Time to load utils op: 0.40506553649902344 secondsTime to load utils op: 0.4046051502227783 seconds 27: 27: Time to load utils op: 0.4050323963165283 seconds 28: Time to load utils op: 0.40389418601989746 seconds 49: Time to load utils op: 0.20275044441223145 seconds 12: Time to load utils op: 0.00035881996154785156 seconds 49: Time to load utils op: 0.20322632789611816 seconds 13: Time to load utils op: 0.0003311634063720703 seconds 27: Time to load utils op: 0.40474534034729004 seconds 12: Time to load utils op: 0.00037670135498046875 seconds 28: Time to load utils op: 0.40398526191711426 seconds 34: Time to load utils op: 0.00047326087951660156 seconds 47: Time to load utils op: 0.20499205589294434 seconds 49: Time to load utils op: 0.2026371955871582 seconds 27: Time to load utils op: 0.40483641624450684 seconds 33: Time to load utils op: 0.0003628730773925781 seconds 28: Time to load utils op: 0.40448427200317383 secondsTime to load utils op: 0.40436744689941406 seconds 28: 47: Time to load utils op: 0.20489978790283203 seconds 49: Time to load utils op: 0.2025434970855713 seconds 27: Time to load utils op: 0.4050426483154297 seconds 49: Time to load utils op: 0.20325589179992676 seconds 33: Time to load utils op: 0.0003464221954345703 seconds 12: Time to load utils op: 0.00030994415283203125 seconds 28: Time to load utils op: 0.40511107444763184 seconds 47: Time to load utils op: 0.20502638816833496 seconds 13: Time to load utils op: 0.0003819465637207031 seconds 27: Time to load utils op: 0.405642032623291 seconds 47: Time to load utils op: 0.20448541641235352 seconds 47: Time to load utils op: 0.204498291015625 seconds 28: Time to load utils op: 0.40501880645751953 seconds 14: Time to load utils op: 0.000469207763671875 seconds 33: Time to load utils op: 0.00036454200744628906 seconds 47: Time to load utils op: 0.2050793170928955 seconds 47: Time to load utils op: 0.2053220272064209 seconds 13: Time to load utils op: 0.0003762245178222656 seconds 13: Time to load utils op: 0.00037217140197753906 seconds 48: Time to load utils op: 0.20520615577697754 seconds 47: Time to load utils op: 0.20480632781982422 seconds 48: Time to load utils op: 0.2051699161529541 seconds 48: Time to load utils op: 0.20398259162902832 secondsTime to load utils op: 0.20470070838928223 seconds 48: 48: Time to load utils op: 0.20512700080871582 seconds 48: Time to load utils op: 0.20551156997680664 seconds 14: Time to load utils op: 0.0003867149353027344 seconds 48: Time to load utils op: 0.20554018020629883 seconds 48: Time to load utils op: 0.20498061180114746 seconds 14: Time to load utils op: 0.00033092498779296875 secondsTime to load utils op: 0.0003819465637207031 seconds 14: 34: Time to load utils op: 0.00040149688720703125 seconds 34: Time to load utils op: 0.0003578662872314453 seconds 14: Time to load utils op: 0.0003936290740966797 seconds 14: Time to load utils op: 0.0003695487976074219 seconds 14: Time to load utils op: 0.00035572052001953125 seconds 35: Time to load utils op: 0.00036406517028808594 seconds 34: Time to load utils op: 0.00035572052001953125 seconds 14: Time to load utils op: 0.00037217140197753906 seconds 50: Time to load utils op: 0.2029109001159668 secondsTime to load utils op: 0.20294404029846191 seconds 50: 15: Time to load utils op: 0.0005173683166503906 seconds 35: Time to load utils op: 0.00034928321838378906 seconds 34: Time to load utils op: 0.0003559589385986328 seconds 50: Time to load utils op: 0.20276117324829102 seconds 34: Time to load utils op: 0.0003833770751953125 seconds 50: Time to load utils op: 0.20327401161193848 secondsTime to load utils op: 0.2032480239868164 seconds 50: 51: Time to load utils op: 0.2039780616760254 seconds 19: Time to load utils op: 0.0005133152008056641 seconds 15: Time to load utils op: 0.0003764629364013672 seconds 37: Time to load utils op: 0.00032067298889160156 seconds 35: Time to load utils op: 0.0003650188446044922 seconds 50: Time to load utils op: 0.20382452011108398 secondsTime to load utils op: 0.20339703559875488 seconds 50: 51: Time to load utils op: 0.2025153636932373 seconds 51: Time to load utils op: 0.20272397994995117 seconds 34: Time to load utils op: 0.0003631114959716797 seconds 50: Time to load utils op: 0.2035236358642578 seconds 35: Time to load utils op: 0.000347137451171875 seconds 51: Time to load utils op: 0.20403051376342773 seconds 51: Time to load utils op: 0.2033843994140625 seconds 51: Time to load utils op: 0.20377445220947266 seconds 34: Time to load utils op: 0.0003609657287597656 seconds 51: Time to load utils op: 0.2037358283996582 seconds 15: Time to load utils op: 0.0003752708435058594 seconds 52: Time to load utils op: 0.20235490798950195 seconds 51: Time to load utils op: 0.20414972305297852 seconds 35: Time to load utils op: 0.00032806396484375 seconds 15: Time to load utils op: 0.0003781318664550781 seconds 52: Time to load utils op: 0.20192289352416992 seconds 17: Time to load utils op: 0.0004329681396484375 seconds 19: Time to load utils op: 0.00034880638122558594 seconds 15: Time to load utils op: 0.00037360191345214844 seconds 0: [2022-11-25 17:01:35,968] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer 37: Time to load utils op: 0.00040078163146972656 seconds 15: Time to load utils op: 0.0003178119659423828 seconds 52: Time to load utils op: 0.2020549774169922 secondsTime to load utils op: 0.20211076736450195 secondsTime to load utils op: 0.20276403427124023 seconds 52: 52: 52: Time to load utils op: 0.20221328735351562 seconds 35: Time to load utils op: 0.00036334991455078125 seconds 19: Time to load utils op: 0.0003597736358642578 seconds 52: Time to load utils op: 0.20209980010986328 seconds 35: Time to load utils op: 0.0003578662872314453 seconds 37: Time to load utils op: 0.0003402233123779297 seconds 0: [2022-11-25 17:01:35,968] [INFO] [utils.py:828:see_memory_usage] MA 11.42 GB Max_MA 11.42 GB CA 15.23 GB Max_CA 15 GB 36: Time to load utils op: 0.00043892860412597656 seconds 15: Time to load utils op: 0.0003635883331298828 secondsTime to load utils op: 0.0003943443298339844 seconds 15: 16: Time to load utils op: 0.00041961669921875 seconds 0: [2022-11-25 17:01:35,969] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 37.39 GB, percent = 7.4% 37: Time to load utils op: 0.00037384033203125 seconds 36: Time to load utils op: 0.00040268898010253906 seconds 53: Time to load utils op: 0.20309090614318848 secondsTime to load utils op: 0.20366549491882324 seconds 53: 0: [2022-11-25 17:01:35,969] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam 53: Time to load utils op: 0.20295047760009766 seconds 0: [2022-11-25 17:01:35,969] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler 17: Time to load utils op: 0.00037789344787597656 seconds 37: Time to load utils op: 0.0003590583801269531 seconds 0: [2022-11-25 17:01:35,969] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = 53: Time to load utils op: 0.20258712768554688 secondsTime to load utils op: 0.20302486419677734 seconds 53: 0: [2022-11-25 17:01:35,969] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0, 0.0], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] 53: Time to load utils op: 0.2028515338897705 seconds 16: Time to load utils op: 0.0003268718719482422 seconds 0: [2022-11-25 17:01:35,969] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: 17: Time to load utils op: 0.0003705024719238281 seconds 36: Time to load utils op: 0.0003705024719238281 seconds 37: Time to load utils op: 0.00034046173095703125 seconds 53: Time to load utils op: 0.20328164100646973 seconds 53: Time to load utils op: 0.20285391807556152 seconds 0: [2022-11-25 17:01:35,969] [INFO] [config.py:1011:print] activation_checkpointing_config { 0: "partition_activations": false, 0: "contiguous_memory_optimization": false, 0: "cpu_checkpointing": false, 0: "number_checkpoints": null, 0: "synchronize_checkpoint_boundary": false, 0: "profile": false 0: } 16: Time to load utils op: 0.0002982616424560547 seconds 17: Time to load utils op: 0.00036072731018066406 secondsTime to load utils op: 0.0003590583801269531 seconds 17: 37: Time to load utils op: 0.00037932395935058594 seconds 0: [2022-11-25 17:01:35,969] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} 36: Time to load utils op: 0.0003848075866699219 seconds 0: [2022-11-25 17:01:35,969] [INFO] [config.py:1011:print] amp_enabled .................. False 0: [2022-11-25 17:01:35,969] [INFO] [config.py:1011:print] amp_params ................... False 36: Time to load utils op: 0.0003609657287597656 seconds 19: Time to load utils op: 0.0003235340118408203 seconds 17: Time to load utils op: 0.0003616809844970703 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] autotuning_config ............ { 0: "enabled": false, 0: "start_step": null, 0: "end_step": null, 0: "metric_path": null, 0: "arg_mappings": null, 0: "metric": "throughput", 0: "model_info": null, 0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", 0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", 0: "overwrite": true, 0: "fast": true, 0: "start_profile_step": 3, 0: "end_profile_step": 5, 0: "tuner_type": "gridsearch", 0: "tuner_early_stopping": 5, 0: "tuner_num_trials": 50, 0: "model_info_path": null, 0: "mp_size": 1, 0: "max_train_batch_size": null, 0: "min_train_batch_size": 1, 0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, 0: "min_train_micro_batch_size_per_gpu": 1, 0: "num_tuning_micro_batch_sizes": 3 0: } 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] bfloat16_enabled ............. True 22: Time to load utils op: 0.0004932880401611328 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False 17: Time to load utils op: 0.00033783912658691406 seconds 36: Time to load utils op: 0.00037789344787597656 seconds 19: Time to load utils op: 0.00034546852111816406 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False 16: Time to load utils op: 0.00041961669921875 secondsTime to load utils op: 0.00038433074951171875 seconds 16: 17: Time to load utils op: 0.0003104209899902344 seconds 36: Time to load utils op: 0.0003566741943359375 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] comms_config ................. 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] communication_data_type ...... None 18: Time to load utils op: 0.0005280971527099609 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa 18: Time to load utils op: 0.0003845691680908203 seconds 0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} 36: Time to load utils op: 0.0003726482391357422 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] curriculum_enabled ........... False 19: Time to load utils op: 0.0003275871276855469 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] curriculum_params ............ False 16: Time to load utils op: 0.00036907196044921875 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] dataloader_drop_last ......... False 16: Time to load utils op: 0.0003750324249267578 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] disable_allgather ............ False 18: Time to load utils op: 0.0003256797790527344 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] dump_state ................... False 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None 19: Time to load utils op: 0.0003542900085449219 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False 16: Time to load utils op: 0.00037384033203125 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer 19: Time to load utils op: 0.0003294944763183594 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 18: Time to load utils op: 0.0003323554992675781 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False 59: Time to load utils op: 0.000457763671875 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] elasticity_enabled ........... False 55: Time to load utils op: 0.2033097743988037 seconds 18: Time to load utils op: 0.0003428459167480469 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] flops_profiler_config ........ { 0: "enabled": false, 0: "profile_step": 1, 0: "module_depth": -1, 0: "top_modules": 1, 0: "detailed": true, 0: "output_file": null 0: } 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] fp16_auto_cast ............... None 55: Time to load utils op: 0.20368170738220215 secondsTime to load utils op: 0.2025618553161621 secondsTime to load utils op: 0.20309066772460938 secondsTime to load utils op: 0.20227456092834473 seconds 55: 55: 55: 18: Time to load utils op: 0.0003199577331542969 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] fp16_enabled ................. False 55: Time to load utils op: 0.20323634147644043 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] global_rank .................. 0 55: Time to load utils op: 0.2034304141998291 seconds 18: Time to load utils op: 0.00037550926208496094 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 2 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 55: Time to load utils op: 0.20317411422729492 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 20: Time to load utils op: 0.00044918060302734375 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 18: Time to load utils op: 0.00036406517028808594 seconds 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] load_universal_checkpoint .... False 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] loss_scale ................... 1.0 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] memory_breakdown ............. False 0: [2022-11-25 17:01:35,970] [INFO] [config.py:1011:print] monitor_config ............... 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] nebula_config ................ { 0: "enabled": false, 0: "persistent_storage_path": null, 0: "persistent_time_interval": 100, 0: "num_of_version_in_retention": 2, 0: "enable_nebula_load": true, 0: "load_path": null 0: } 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False 54: Time to load utils op: 0.20505189895629883 seconds 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] optimizer_name ............... None 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] optimizer_params ............. None 41: Time to load utils op: 0.0004868507385253906 seconds 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] pld_enabled .................. False 54: Time to load utils op: 0.20508098602294922 seconds 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] pld_params ................... False 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] prescale_gradients ........... False 54: Time to load utils op: 0.2051219940185547 secondsTime to load utils op: 0.20505547523498535 seconds 54: 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] scheduler_name ............... None 54: Time to load utils op: 0.20509672164916992 seconds 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] scheduler_params ............. None 54: Time to load utils op: 0.20527935028076172 seconds 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] sparse_attention ............. None 20: Time to load utils op: 0.0003390312194824219 seconds 39: Time to load utils op: 0.0005736351013183594 seconds 54: Time to load utils op: 0.20512175559997559 seconds 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False 22: Time to load utils op: 0.00038552284240722656 seconds 20: Time to load utils op: 0.0004303455352783203 seconds 21: Time to load utils op: 0.0004246234893798828 seconds 54: Time to load utils op: 0.2053534984588623 seconds 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] steps_per_print .............. 2000 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] train_batch_size ............. 512 20: Time to load utils op: 0.0003483295440673828 seconds 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 1 21: Time to load utils op: 0.0004050731658935547 seconds 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] use_node_local_storage ....... False 39: Time to load utils op: 0.0003211498260498047 seconds 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False 41: Time to load utils op: 0.0003535747528076172 seconds 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] world_size ................... 256 22: Time to load utils op: 0.0003628730773925781 seconds 41: Time to load utils op: 0.0003502368927001953 seconds 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False 56: Time to load utils op: 0.2040877342224121 seconds 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False 40: Time to load utils op: 0.0004680156707763672 seconds 21: Time to load utils op: 0.0003552436828613281 seconds 56: Time to load utils op: 0.20403075218200684 secondsTime to load utils op: 0.20435571670532227 seconds 56: 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] zero_enabled ................. False 20: Time to load utils op: 0.0003800392150878906 seconds 41: Time to load utils op: 0.0003113746643066406 seconds 39: Time to load utils op: 0.0004246234893798828 seconds 0: [2022-11-25 17:01:35,971] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 20: Time to load utils op: 0.0003707408905029297 seconds 56: Time to load utils op: 0.20463109016418457 secondsTime to load utils op: 0.20399999618530273 seconds 56: 0: [2022-11-25 17:01:35,971] [INFO] [config.py:996:print_user_config] json = { 0: "train_micro_batch_size_per_gpu": 1, 0: "train_batch_size": 512, 0: "gradient_clipping": 1.0, 0: "zero_optimization": { 0: "stage": 0 0: }, 0: "bf16": { 0: "enabled": true 0: }, 0: "steps_per_print": 2.000000e+03, 0: "wall_clock_breakdown": false 0: } 38: Time to load utils op: 0.0004470348358154297 seconds 56: Time to load utils op: 0.2040400505065918 seconds 41: Time to load utils op: 0.0003123283386230469 seconds 21: Time to load utils op: 0.0003719329833984375 seconds 56: Time to load utils op: 0.20497918128967285 seconds 21: Time to load utils op: 0.0003504753112792969 seconds 0: Time to load utils op: 0.0005283355712890625 seconds 22: Time to load utils op: 0.0003256797790527344 secondsTime to load utils op: 0.0003941059112548828 seconds 22: 22: Time to load utils op: 0.00039887428283691406 seconds 20: Time to load utils op: 0.0003771781921386719 seconds 39: Time to load utils op: 0.0004265308380126953 seconds 39: Time to load utils op: 0.0004189014434814453 seconds 38: Time to load utils op: 0.0005946159362792969 seconds 57: Time to load utils op: 0.2029430866241455 secondsTime to load utils op: 0.20274019241333008 seconds 57: 57: Time to load utils op: 0.20295476913452148 seconds 56: Time to load utils op: 0.20497488975524902 seconds 0: [2022-11-25 17:01:35,972] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=2 micro_batch_size=1 20: Time to load utils op: 0.00037860870361328125 seconds 40: Time to load utils op: 0.00035119056701660156 seconds 41: Time to load utils op: 0.0003657341003417969 seconds 38: Time to load utils op: 0.00037980079650878906 seconds 57: Time to load utils op: 0.20279479026794434 seconds 22: Time to load utils op: 0.00037360191345214844 seconds 39: Time to load utils op: 0.00039005279541015625 seconds 21: Time to load utils op: 0.00036716461181640625 seconds 57: Time to load utils op: 0.2029588222503662 seconds 40: Time to load utils op: 0.0003275871276855469 seconds 38: Time to load utils op: 0.0003077983856201172 seconds 41: Time to load utils op: 0.0003330707550048828 seconds 39: Time to load utils op: 0.00036597251892089844 seconds 57: Time to load utils op: 0.20357012748718262 seconds 22: Time to load utils op: 0.0003514289855957031 seconds 57: Time to load utils op: 0.20310139656066895 seconds 21: Time to load utils op: 0.00040030479431152344 seconds 41: Time to load utils op: 0.00033283233642578125 seconds 40: Time to load utils op: 0.0003409385681152344 seconds 39: Time to load utils op: 0.00037169456481933594 seconds 57: Time to load utils op: 0.20321345329284668 seconds 21: Time to load utils op: 0.0003731250762939453 seconds 38: Time to load utils op: 0.00037384033203125 seconds 40: Time to load utils op: 0.00036263465881347656 seconds 38: Time to load utils op: 0.00037169456481933594 seconds 38: Time to load utils op: 0.0003669261932373047 seconds 40: Time to load utils op: 0.00035643577575683594 seconds 40: Time to load utils op: 0.0003037452697753906 seconds 42: Time to load utils op: 0.0004849433898925781 seconds 38: Time to load utils op: 0.00037670135498046875 seconds 42: Time to load utils op: 0.0003540515899658203 seconds 58: Time to load utils op: 0.20337438583374023 secondsTime to load utils op: 0.20305705070495605 seconds 58: 40: Time to load utils op: 0.0003542900085449219 seconds 42: Time to load utils op: 0.00035643577575683594 seconds 58: Time to load utils op: 0.20375657081604004 seconds 58: Time to load utils op: 0.20376086235046387 seconds 42: Time to load utils op: 0.00034928321838378906 seconds 58: Time to load utils op: 0.2031097412109375 seconds 58: Time to load utils op: 0.20383620262145996 seconds 42: Time to load utils op: 0.00034546852111816406 seconds 59: Time to load utils op: 0.20224404335021973 secondsTime to load utils op: 0.20245647430419922 seconds 59: 58: Time to load utils op: 0.20381784439086914 seconds 59: Time to load utils op: 0.20250439643859863 secondsTime to load utils op: 0.20250320434570312 secondsTime to load utils op: 0.20235490798950195 secondsTime to load utils op: 0.20229792594909668 seconds 59: 59: 59: Time to load utils op: 0.20236778259277344 seconds 59: 58: Time to load utils op: 0.2041475772857666 seconds 42: Time to load utils op: 0.0003275871276855469 seconds 24: Time to load utils op: 0.0005023479461669922 seconds 42: Time to load utils op: 0.0003426074981689453 seconds 42: Time to load utils op: 0.0003581047058105469 seconds 43: Time to load utils op: 0.0003261566162109375 seconds 24: Time to load utils op: 0.00032329559326171875 seconds 43: Time to load utils op: 0.0003254413604736328 seconds 24: Time to load utils op: 0.0003628730773925781 seconds 43: Time to load utils op: 0.0004374980926513672 secondsTime to load utils op: 0.00042366981506347656 seconds 43: 23: Time to load utils op: 0.000484466552734375 seconds 23: Time to load utils op: 0.0003540515899658203 seconds 24: Time to load utils op: 0.00032830238342285156 seconds 32: Time to load utils op: 0.0003695487976074219 seconds 23: Time to load utils op: 0.00035071372985839844 seconds 24: Time to load utils op: 0.00032401084899902344 seconds 43: Time to load utils op: 0.0003910064697265625 seconds 25: Time to load utils op: 0.0004684925079345703 seconds 43: Time to load utils op: 0.0003917217254638672 seconds 43: Time to load utils op: 0.0003380775451660156 seconds 23: Time to load utils op: 0.0003597736358642578 seconds 24: Time to load utils op: 0.0004048347473144531 seconds 24: Time to load utils op: 0.00042366981506347656 seconds 23: Time to load utils op: 0.0003409385681152344 seconds 60: Time to load utils op: 0.2027449607849121 seconds 24: Time to load utils op: 0.00036406517028808594 seconds 25: Time to load utils op: 0.0003147125244140625 seconds 60: Time to load utils op: 0.2028367519378662 seconds 23: Time to load utils op: 0.0003311634063720703 seconds 60: Time to load utils op: 0.20281267166137695 secondsTime to load utils op: 0.20285248756408691 seconds 60: 60: Time to load utils op: 0.20300841331481934 seconds 23: Time to load utils op: 0.00031375885009765625 seconds 44: Time to load utils op: 0.0004582405090332031 seconds 45: Time to load utils op: 0.000514984130859375 seconds 61: Time to load utils op: 0.20387697219848633 secondsTime to load utils op: 0.20403742790222168 seconds 61: 60: Time to load utils op: 0.20372247695922852 seconds 25: Time to load utils op: 0.000347137451171875 seconds 44: Time to load utils op: 0.0004391670227050781 seconds 45: Time to load utils op: 0.0003829002380371094 seconds 61: Time to load utils op: 0.20381784439086914 seconds 60: Time to load utils op: 0.20327472686767578 seconds 23: Time to load utils op: 0.00031638145446777344 seconds 61: Time to load utils op: 0.20393800735473633 seconds 25: Time to load utils op: 0.00032067298889160156 seconds 44: Time to load utils op: 0.0003185272216796875 seconds 45: Time to load utils op: 0.0003535747528076172 seconds 61: Time to load utils op: 0.2039029598236084 seconds 61: Time to load utils op: 0.20346760749816895 seconds 61: Time to load utils op: 0.2034931182861328 seconds 60: Time to load utils op: 0.20463919639587402 seconds 25: Time to load utils op: 0.00040793418884277344 seconds 44: Time to load utils op: 0.00032830238342285156 seconds 45: Time to load utils op: 0.00037789344787597656 seconds 45: Time to load utils op: 0.0003631114959716797 seconds 61: Time to load utils op: 0.20371031761169434 seconds 25: Time to load utils op: 0.0003643035888671875 seconds 44: Time to load utils op: 0.0003752708435058594 seconds 25: Time to load utils op: 0.0003724098205566406 seconds 45: Time to load utils op: 0.00038051605224609375 seconds 44: Time to load utils op: 0.0003170967102050781 seconds 45: Time to load utils op: 0.0003750324249267578 seconds 25: Time to load utils op: 0.0003597736358642578 seconds 44: Time to load utils op: 0.0003635883331298828 seconds 45: Time to load utils op: 0.00030541419982910156 seconds 44: Time to load utils op: 0.00034689903259277344 seconds 46: Time to load utils op: 0.0005068778991699219 seconds 62: Time to load utils op: 0.20246648788452148 seconds 62: Time to load utils op: 0.20300912857055664 seconds 26: Time to load utils op: 0.00046372413635253906 seconds 62: Time to load utils op: 0.20246458053588867 seconds 26: Time to load utils op: 0.0004057884216308594 seconds 62: Time to load utils op: 0.20335698127746582 seconds 26: Time to load utils op: 0.0003387928009033203 seconds 62: Time to load utils op: 0.2025752067565918 seconds 62: Time to load utils op: 0.2028522491455078 seconds 26: Time to load utils op: 0.00034546852111816406 seconds 62: Time to load utils op: 0.20293140411376953 seconds 46: Time to load utils op: 0.0003676414489746094 seconds 46: Time to load utils op: 0.000362396240234375 seconds 62: Time to load utils op: 0.20313072204589844 seconds 28: Time to load utils op: 0.00045990943908691406 seconds 46: Time to load utils op: 0.0003604888916015625 seconds 63: Time to load utils op: 0.20451569557189941 seconds 26: Time to load utils op: 0.0003666877746582031 seconds 46: Time to load utils op: 0.00029921531677246094 seconds 63: Time to load utils op: 0.2035083770751953 secondsTime to load utils op: 0.20438790321350098 seconds 63: 63: Time to load utils op: 0.20474624633789062 seconds 26: Time to load utils op: 0.00037384033203125 seconds 46: Time to load utils op: 0.0003342628479003906 seconds 47: Time to load utils op: 0.00043487548828125 seconds 63: Time to load utils op: 0.20473480224609375 secondsTime to load utils op: 0.20441269874572754 seconds 63: 47: Time to load utils op: 0.0003657341003417969 seconds 26: Time to load utils op: 0.00036334991455078125 seconds 63: Time to load utils op: 0.20406484603881836 seconds 26: Time to load utils op: 0.00038361549377441406 seconds 46: Time to load utils op: 0.000316619873046875 seconds 63: Time to load utils op: 0.2040877342224121 seconds 27: Time to load utils op: 0.00042700767517089844 seconds 28: Time to load utils op: 0.00040340423583984375 seconds 28: Time to load utils op: 0.00034236907958984375 seconds 47: Time to load utils op: 0.0003790855407714844 seconds 27: Time to load utils op: 0.00031638145446777344 seconds 46: Time to load utils op: 0.0003209114074707031 seconds 28: Time to load utils op: 0.0003402233123779297 seconds 49: Time to load utils op: 0.000335693359375 seconds 47: Time to load utils op: 0.00035953521728515625 seconds 49: Time to load utils op: 0.00032806396484375 seconds 27: Time to load utils op: 0.00036644935607910156 seconds 27: Time to load utils op: 0.0004215240478515625 secondsTime to load utils op: 0.00040984153747558594 secondsTime to load utils op: 0.00040984153747558594 seconds 27: 27: 28: Time to load utils op: 0.00034499168395996094 seconds 47: Time to load utils op: 0.00038433074951171875 seconds 28: Time to load utils op: 0.0003707408905029297 seconds 47: Time to load utils op: 0.00037741661071777344 seconds 49: Time to load utils op: 0.00030493736267089844 seconds 27: Time to load utils op: 0.0003325939178466797 seconds 28: Time to load utils op: 0.00035834312438964844 seconds 47: Time to load utils op: 0.00037407875061035156 seconds 47: Time to load utils op: 0.00032711029052734375 seconds 49: Time to load utils op: 0.00037217140197753906 seconds 28: Time to load utils op: 0.00035643577575683594 seconds 49: Time to load utils op: 0.0003826618194580078 seconds 27: Time to load utils op: 0.0003745555877685547 seconds 48: Time to load utils op: 0.0005550384521484375 seconds 48: Time to load utils op: 0.0004906654357910156 seconds 49: Time to load utils op: 0.0003561973571777344 seconds 49: Time to load utils op: 0.00035262107849121094 seconds 48: Time to load utils op: 0.0003173351287841797 seconds 48: Time to load utils op: 0.0004069805145263672 seconds 48: Time to load utils op: 0.00038886070251464844 seconds 52: Time to load utils op: 0.00032520294189453125 seconds 50: Time to load utils op: 0.0004589557647705078 seconds 48: Time to load utils op: 0.00032901763916015625 secondsTime to load utils op: 0.00034546852111816406 seconds 48: 48: Time to load utils op: 0.00041031837463378906 seconds 52: Time to load utils op: 0.00032782554626464844 seconds 50: Time to load utils op: 0.0003972053527832031 secondsTime to load utils op: 0.0004038810729980469 seconds 50: 50: Time to load utils op: 0.00035119056701660156 seconds 51: Time to load utils op: 0.0004596710205078125 seconds 51: Time to load utils op: 0.0004417896270751953 seconds 52: Time to load utils op: 0.000396728515625 seconds 52: Time to load utils op: 0.00035452842712402344 seconds 50: Time to load utils op: 0.00033664703369140625 secondsTime to load utils op: 0.0003390312194824219 seconds 50: 51: Time to load utils op: 0.00032210350036621094 seconds 51: Time to load utils op: 0.0003943443298339844 seconds 52: Time to load utils op: 0.0002989768981933594 seconds 50: Time to load utils op: 0.0003681182861328125 seconds 50: Time to load utils op: 0.0003581047058105469 seconds 52: Time to load utils op: 0.0003616809844970703 seconds 51: Time to load utils op: 0.000331878662109375 seconds 51: Time to load utils op: 0.0003864765167236328 seconds 52: Time to load utils op: 0.00038313865661621094 seconds 51: Time to load utils op: 0.00037217140197753906 seconds 51: Time to load utils op: 0.00038242340087890625 seconds 53: Time to load utils op: 0.00047588348388671875 seconds 53: Time to load utils op: 0.0005571842193603516 seconds 53: Time to load utils op: 0.0003807544708251953 seconds 53: Time to load utils op: 0.0003788471221923828 seconds 53: Time to load utils op: 0.00033664703369140625 seconds 53: Time to load utils op: 0.00032711029052734375 seconds 53: Time to load utils op: 0.0004055500030517578 seconds 53: Time to load utils op: 0.0003993511199951172 seconds 54: Time to load utils op: 0.000461578369140625 seconds 55: Time to load utils op: 0.0003490447998046875 seconds 55: Time to load utils op: 0.0005590915679931641 seconds 55: Time to load utils op: 0.0004012584686279297 seconds 55: Time to load utils op: 0.0003933906555175781 seconds 54: Time to load utils op: 0.0003070831298828125 seconds 55: Time to load utils op: 0.0003757476806640625 seconds 57: Time to load utils op: 0.0005092620849609375 seconds 55: Time to load utils op: 0.0003268718719482422 seconds 55: Time to load utils op: 0.0003859996795654297 seconds 54: Time to load utils op: 0.0004010200500488281 seconds 55: Time to load utils op: 0.0003695487976074219 seconds 54: Time to load utils op: 0.00033354759216308594 seconds 56: Time to load utils op: 0.0004482269287109375 seconds 54: Time to load utils op: 0.0003917217254638672 seconds 56: Time to load utils op: 0.00039315223693847656 seconds 57: Time to load utils op: 0.0003764629364013672 seconds 54: Time to load utils op: 0.0003399848937988281 seconds 57: Time to load utils op: 0.00038123130798339844 seconds 57: Time to load utils op: 0.0003743171691894531 seconds 54: Time to load utils op: 0.0003628730773925781 seconds 54: Time to load utils op: 0.0003612041473388672 seconds 56: Time to load utils op: 0.0003554821014404297 seconds 56: Time to load utils op: 0.0003306865692138672 seconds 57: Time to load utils op: 0.00034356117248535156 seconds 56: Time to load utils op: 0.0003211498260498047 seconds 57: Time to load utils op: 0.00034499168395996094 seconds 57: Time to load utils op: 0.00038933753967285156 seconds 57: Time to load utils op: 0.00037360191345214844 seconds 56: Time to load utils op: 0.0003662109375 seconds 56: Time to load utils op: 0.00036263465881347656 seconds 56: Time to load utils op: 0.00039696693420410156 seconds 59: Time to load utils op: 0.00032901763916015625 seconds 59: Time to load utils op: 0.00031065940856933594 seconds 58: Time to load utils op: 0.0003829002380371094 secondsTime to load utils op: 0.0005288124084472656 seconds 58: 59: Time to load utils op: 0.0004208087921142578 seconds 59: Time to load utils op: 0.00037169456481933594 seconds 58: Time to load utils op: 0.0003597736358642578 seconds 58: Time to load utils op: 0.0003509521484375 seconds 59: Time to load utils op: 0.0003676414489746094 seconds 58: Time to load utils op: 0.000301361083984375 seconds 59: Time to load utils op: 0.00037384033203125 seconds 58: Time to load utils op: 0.000362396240234375 seconds 58: Time to load utils op: 0.0003638267517089844 seconds 58: Time to load utils op: 0.0003631114959716797 seconds 59: Time to load utils op: 0.0003681182861328125 seconds 60: Time to load utils op: 0.00042891502380371094 seconds 60: Time to load utils op: 0.00042891502380371094 seconds 61: Time to load utils op: 0.0004489421844482422 seconds 61: Time to load utils op: 0.0004696846008300781 seconds 60: Time to load utils op: 0.0003821849822998047 seconds 61: Time to load utils op: 0.000331878662109375 seconds 60: Time to load utils op: 0.0003809928894042969 seconds 60: Time to load utils op: 0.0003788471221923828 seconds 61: Time to load utils op: 0.0003190040588378906 seconds 60: Time to load utils op: 0.0003409385681152344 seconds 60: Time to load utils op: 0.0003323554992675781 seconds 61: Time to load utils op: 0.0004012584686279297 seconds 60: Time to load utils op: 0.00038909912109375 seconds 61: Time to load utils op: 0.00035834312438964844 seconds 61: Time to load utils op: 0.0003631114959716797 seconds 61: Time to load utils op: 0.00035858154296875 seconds 62: Time to load utils op: 0.0004963874816894531 seconds 62: Time to load utils op: 0.0003628730773925781 seconds 62: Time to load utils op: 0.0003428459167480469 seconds 62: Time to load utils op: 0.00030684471130371094 seconds 62: Time to load utils op: 0.0003731250762939453 seconds 62: Time to load utils op: 0.0003457069396972656 seconds 62: Time to load utils op: 0.0003464221954345703 seconds 63: Time to load utils op: 0.0004930496215820312 seconds 62: Time to load utils op: 0.0003237724304199219 seconds 63: Time to load utils op: 0.0003581047058105469 seconds 63: Time to load utils op: 0.0003941059112548828 seconds 63: Time to load utils op: 0.0003592967987060547 seconds 63: Time to load utils op: 0.00036907196044921875 seconds 63: Time to load utils op: 0.00035762786865234375 seconds 63: Time to load utils op: 0.0003559589385986328 seconds 63: Time to load utils op: 0.00030994415283203125 seconds 0: [2022-11-25 17:01:36,142] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=21 [0, 21) STAGE_PARAMS=2026914560 (2026.915M) TOTAL_PARAMS=4053835008 (4053.835M) UNIQUE_PARAMS=3899710720 (3899.711M) 32: [2022-11-25 17:01:36,142] [INFO] [engine.py:145:__init__] RANK=256 STAGE=1 LAYERS=22 [21, 43) STAGE_PARAMS=2026920448 (2026.920M) TOTAL_PARAMS=4053835008 (4053.835M) UNIQUE_PARAMS=3899710720 (3899.711M) 0: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 0: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 56: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 56: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 62: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 62: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 0: WARNING: could not find the metadata file checkpoints_3b9 0: will not load any checkpoints and will start from random 32: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 48: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 48: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 48: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 48: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 62: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 60: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 60: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 0: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 32: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 61: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 61: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 56: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 60: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 0: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 40: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 40: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 46: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 46: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 46: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 46: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 48: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 55: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 55: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 55: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 57: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 57: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 61: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 59: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 59: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 59: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 59: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 52: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 52: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 52: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 52: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 54: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 54: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 56: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 58: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 58: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 58: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 58: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 62: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 16: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 16: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 16: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 16: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 60: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 8: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 8: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 0: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 0: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 31: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 31: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 22: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 22: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 30: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 30: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 30: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 24: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 24: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 28: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 28: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 28: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 28: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 20: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 20: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 40: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 44: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 44: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 44: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 32: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 46: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 47: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 47: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 47: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 47: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 45: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 45: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 51: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 51: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 51: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 55: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 57: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 61: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 59: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 59: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 19: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 19: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 21: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 15: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 15: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 15: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 52: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 52: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 54: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 54: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 56: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 56: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 58: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 58: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 62: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 62: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 16: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 16: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 60: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 4: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 4: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 4: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 14: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 14: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 18: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 18: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 18: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 10: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 10: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 8: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 0: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 26: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 26: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 26: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 26: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 11: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 11: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 13: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 13: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 23: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 23: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 23: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 23: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 25: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 25: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 27: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 27: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 27: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 29: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 29: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 29: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 29: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 31: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 22: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 12: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 12: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 30: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 24: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 28: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 28: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 20: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 34: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 34: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 40: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 44: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 32: [2022-11-25 17:01:38,572] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 42: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 42: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 36: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 36: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 48: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 41: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 41: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 47: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 47: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 49: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 49: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 45: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 45: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 45: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 39: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 39: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 50: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 50: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 38: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 38: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 38: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 51: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 55: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 55: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 57: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 61: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 59: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 59: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 19: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 21: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 21: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 15: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 52: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 54: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 54: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 56: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 58: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 62: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 62: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 16: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 16: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 60: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 2: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 2: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 4: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 14: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 18: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 10: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 10: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 8: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 0: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 26: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 26: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 11: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 17: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 17: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 9: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 9: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 13: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 23: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 25: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 27: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 29: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 31: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 5: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 5: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 1: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 1: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 3: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 3: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 35: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 35: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 35: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 22: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 12: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 12: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 30: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 24: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 28: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 28: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 20: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 34: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 40: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 40: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 44: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 6: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 6: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 32: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 42: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 42: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 46: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 36: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 48: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 41: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 41: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 47: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 49: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 43: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 43: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 43: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 39: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 39: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 50: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 50: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 38: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 51: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 55: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 55: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 53: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 53: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 57: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 61: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 19: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 21: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 15: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 15: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 52: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 54: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 56: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 58: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 60: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 60: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 2: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 4: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 14: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 18: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 18: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 10: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 10: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 8: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 8: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 26: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 26: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 11: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 17: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 9: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 13: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 23: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 25: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 27: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 27: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 29: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 31: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 31: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 5: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 33: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 33: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 1: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 7: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 7: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 3: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 3: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 35: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 22: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 22: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 12: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 30: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 24: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 24: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 20: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 20: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 34: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 40: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 40: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 44: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 6: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 32: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 42: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 42: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 46: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 36: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 48: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 41: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 41: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 37: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 37: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 47: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 49: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 43: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 45: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 39: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 51: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 53: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 57: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 61: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 19: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 19: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 21: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 21: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 15: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 54: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 2: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 4: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 14: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 14: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 18: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 18: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 10: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 8: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 11: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 11: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 17: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 9: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 13: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 13: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 23: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 23: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 25: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 25: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 27: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 27: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 29: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 29: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 31: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 31: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 5: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 33: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 33: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 1: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 7: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 3: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 3: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 35: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 35: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 22: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 12: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 30: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 30: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 24: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 20: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 20: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 34: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 34: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 44: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 6: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 32: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 32: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 42: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 46: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 36: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 36: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 41: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 41: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 37: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 49: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 49: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 43: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 43: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 45: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 45: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 39: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 50: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 38: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 38: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 51: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 51: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 53: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 57: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 57: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 61: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 19: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 19: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 21: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 21: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 15: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 2: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 2: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 4: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 4: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 14: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 14: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 10: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 8: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 11: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 11: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 17: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 17: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 9: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 13: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 13: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 25: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 25: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 5: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 33: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 1: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 7: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 3: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 35: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 22: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 12: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 12: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 24: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 34: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 44: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 6: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 42: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 36: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 37: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 49: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 49: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 39: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 39: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 50: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 38: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 53: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 2: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 17: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 9: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 5: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 33: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 1: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 7: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 3: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 35: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 34: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 6: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 36: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 37: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 37: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 43: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 43: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 50: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 50: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 38: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 53: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 2: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 17: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 9: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 5: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 5: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 33: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 33: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 1: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 7: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 6: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 53: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 53: [2022-11-25 17:01:38,573] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 9: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 1: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 6: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 37: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 7: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 37: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 7: [2022-11-25 17:01:38,574] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_3b9/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: time (ms) | load-checkpoint: 11.45 0: estimated model parameters: 4.05382912 0: estimated model parameters without embeddings: 3.745580544 0: [after model, optimizer, and learning rate scheduler are built] datetime: 2022-11-25 17:01:38 0: > building train, validation, and test datasets ... 0: > datasets target sizes (minimum size): 0: train: 12505484 0: validation: 12800 0: test: 512 0: > building train, validation, and test datasets for GPT ... 0: > building dataset index ... 0: reading sizes... 0: reading pointers... 0: reading document index... 0: creating numpy buffer of mmap... 0: creating memory view of numpy buffer... 0: > finished creating indexed dataset in 0.001837 seconds 0: number of documents: 210604984 0: > dataset split: 0: train: 0: document indices in [0, 199864130) total of 199864130 documents 0: validation: 0: document indices in [199864130, 210394379) total of 10530249 documents 0: test: 0: document indices in [210394379, 210604984) total of 210605 documents 0: > loading doc-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_train_indexmap_12505484ns_2048sl_1234s_doc_idx.npy 0: > loading sample-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_train_indexmap_12505484ns_2048sl_1234s_sample_idx.npy 0: > loading shuffle-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_train_indexmap_12505484ns_2048sl_1234s_shuffle_idx.npy 0: loaded indexed file in 0.006 seconds 0: total number of samples: 173377817 0: total number of epochs: 1 0: > loading doc-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_valid_indexmap_12800ns_2048sl_1234s_doc_idx.npy 0: > loading sample-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_valid_indexmap_12800ns_2048sl_1234s_sample_idx.npy 0: > loading shuffle-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_valid_indexmap_12800ns_2048sl_1234s_shuffle_idx.npy 0: loaded indexed file in 0.008 seconds 0: total number of samples: 9118345 0: total number of epochs: 1 0: > loading doc-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_test_indexmap_512ns_2048sl_1234s_doc_idx.npy 0: > loading sample-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_test_indexmap_512ns_2048sl_1234s_sample_idx.npy 0: > loading shuffle-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_test_indexmap_512ns_2048sl_1234s_shuffle_idx.npy 0: loaded indexed file in 0.003 seconds 0: total number of samples: 182928 0: total number of epochs: 1 0: > finished creating GPT datasets ... 0: [after dataloaders are built] datetime: 2022-11-25 17:01:57 0: done with setup ... 0: training ... 0: Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: 63: time (ms) | model-and-optimizer-setup: 45014.82 | train/valid/test-data-iterators-setup: 18685.00 32: [000-001] 4.0538B / 3.7456B 0: [000-000] 4.0538B / 3.7456B 0: [before the start of training step] datetime: 2022-11-25 17:01:58 0: [Rank 0] (after 10 iterations) memory (MB) | allocated: 15589.80810546875 | max allocated: 46167.35498046875 | reserved: 49744.0 | max reserved: 49744.0 32: [Rank 256] (after 10 iterations) memory (MB) | allocated: 16405.80615234375 | max allocated: 32144.1962890625 | reserved: 38584.0 | max reserved: 38584.0 63: iteration 10/ 24424 | consumed samples: 5120 | consumed tokens: 10485760 | elapsed time per iteration (s): 4.99 | learning rate: 8.188E-06 | global batch size: 512 | lm loss: 1.118308E+01 | grad norm: 19.755 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 102.515 | TFLOPs: 10.55 | 63: iteration 20/ 24424 | consumed samples: 10240 | consumed tokens: 20971520 | elapsed time per iteration (s): 2.27 | learning rate: 1.638E-05 | global batch size: 512 | lm loss: 9.151855E+00 | grad norm: 10.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.192 | TFLOPs: 23.18 | 63: iteration 30/ 24424 | consumed samples: 15360 | consumed tokens: 31457280 | elapsed time per iteration (s): 2.27 | learning rate: 2.457E-05 | global batch size: 512 | lm loss: 8.236140E+00 | grad norm: 4.833 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.118 | TFLOPs: 23.17 | 63: iteration 40/ 24424 | consumed samples: 20480 | consumed tokens: 41943040 | elapsed time per iteration (s): 2.28 | learning rate: 3.275E-05 | global batch size: 512 | lm loss: 7.612685E+00 | grad norm: 2.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.021 | TFLOPs: 23.16 | 63: iteration 50/ 24424 | consumed samples: 25600 | consumed tokens: 52428800 | elapsed time per iteration (s): 2.27 | learning rate: 4.094E-05 | global batch size: 512 | lm loss: 7.297845E+00 | grad norm: 2.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.678 | TFLOPs: 23.23 | 63: iteration 60/ 24424 | consumed samples: 30720 | consumed tokens: 62914560 | elapsed time per iteration (s): 2.31 | learning rate: 4.913E-05 | global batch size: 512 | lm loss: 7.106538E+00 | grad norm: 1.948 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.271 | TFLOPs: 22.78 | 63: iteration 70/ 24424 | consumed samples: 35840 | consumed tokens: 73400320 | elapsed time per iteration (s): 2.29 | learning rate: 5.732E-05 | global batch size: 512 | lm loss: 6.980643E+00 | grad norm: 1.977 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.175 | TFLOPs: 22.97 | 63: iteration 80/ 24424 | consumed samples: 40960 | consumed tokens: 83886080 | elapsed time per iteration (s): 2.27 | learning rate: 6.551E-05 | global batch size: 512 | lm loss: 6.819228E+00 | grad norm: 2.582 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.029 | TFLOPs: 23.27 | 63: iteration 90/ 24424 | consumed samples: 46080 | consumed tokens: 94371840 | elapsed time per iteration (s): 2.27 | learning rate: 7.370E-05 | global batch size: 512 | lm loss: 6.666166E+00 | grad norm: 3.075 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.294 | TFLOPs: 23.19 | 63: iteration 100/ 24424 | consumed samples: 51200 | consumed tokens: 104857600 | elapsed time per iteration (s): 2.27 | learning rate: 8.188E-05 | global batch size: 512 | lm loss: 6.506441E+00 | grad norm: 2.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.017 | TFLOPs: 23.27 | 63: iteration 110/ 24424 | consumed samples: 56320 | consumed tokens: 115343360 | elapsed time per iteration (s): 2.30 | learning rate: 9.007E-05 | global batch size: 512 | lm loss: 6.354548E+00 | grad norm: 1.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.350 | TFLOPs: 22.89 | 63: iteration 120/ 24424 | consumed samples: 61440 | consumed tokens: 125829120 | elapsed time per iteration (s): 2.25 | learning rate: 9.826E-05 | global batch size: 512 | lm loss: 6.271078E+00 | grad norm: 1.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.061 | TFLOPs: 23.37 | 63: iteration 130/ 24424 | consumed samples: 66560 | consumed tokens: 136314880 | elapsed time per iteration (s): 2.26 | learning rate: 1.064E-04 | global batch size: 512 | lm loss: 6.127692E+00 | grad norm: 1.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.551 | TFLOPs: 23.32 | 63: iteration 140/ 24424 | consumed samples: 71680 | consumed tokens: 146800640 | elapsed time per iteration (s): 2.32 | learning rate: 1.146E-04 | global batch size: 512 | lm loss: 6.002659E+00 | grad norm: 1.669 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.568 | TFLOPs: 22.71 | 63: iteration 150/ 24424 | consumed samples: 76800 | consumed tokens: 157286400 | elapsed time per iteration (s): 2.23 | learning rate: 1.228E-04 | global batch size: 512 | lm loss: 5.897334E+00 | grad norm: 1.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.106 | TFLOPs: 23.59 | 63: iteration 160/ 24424 | consumed samples: 81920 | consumed tokens: 167772160 | elapsed time per iteration (s): 2.25 | learning rate: 1.310E-04 | global batch size: 512 | lm loss: 5.865603E+00 | grad norm: 2.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.977 | TFLOPs: 23.47 | 63: iteration 170/ 24424 | consumed samples: 87040 | consumed tokens: 178257920 | elapsed time per iteration (s): 2.28 | learning rate: 1.392E-04 | global batch size: 512 | lm loss: 5.793326E+00 | grad norm: 1.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.465 | TFLOPs: 23.11 | 63: iteration 180/ 24424 | consumed samples: 92160 | consumed tokens: 188743680 | elapsed time per iteration (s): 2.25 | learning rate: 1.474E-04 | global batch size: 512 | lm loss: 5.690867E+00 | grad norm: 0.946 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.693 | TFLOPs: 23.44 | 63: iteration 190/ 24424 | consumed samples: 97280 | consumed tokens: 199229440 | elapsed time per iteration (s): 2.25 | learning rate: 1.556E-04 | global batch size: 512 | lm loss: 5.643260E+00 | grad norm: 0.864 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.524 | TFLOPs: 23.42 | 63: iteration 200/ 24424 | consumed samples: 102400 | consumed tokens: 209715200 | elapsed time per iteration (s): 2.28 | learning rate: 1.638E-04 | global batch size: 512 | lm loss: 5.577538E+00 | grad norm: 1.041 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.592 | TFLOPs: 23.12 | 63: iteration 210/ 24424 | consumed samples: 107520 | consumed tokens: 220200960 | elapsed time per iteration (s): 2.23 | learning rate: 1.720E-04 | global batch size: 512 | lm loss: 5.546515E+00 | grad norm: 0.936 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.427 | TFLOPs: 23.62 | 63: iteration 220/ 24424 | consumed samples: 112640 | consumed tokens: 230686720 | elapsed time per iteration (s): 2.25 | learning rate: 1.801E-04 | global batch size: 512 | lm loss: 5.500638E+00 | grad norm: 1.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.847 | TFLOPs: 23.46 | 63: iteration 230/ 24424 | consumed samples: 117760 | consumed tokens: 241172480 | elapsed time per iteration (s): 2.30 | learning rate: 1.883E-04 | global batch size: 512 | lm loss: 5.458183E+00 | grad norm: 1.094 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.441 | TFLOPs: 22.90 | 63: iteration 240/ 24424 | consumed samples: 122880 | consumed tokens: 251658240 | elapsed time per iteration (s): 2.29 | learning rate: 1.965E-04 | global batch size: 512 | lm loss: 5.411374E+00 | grad norm: 1.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.662 | TFLOPs: 23.02 | 63: iteration 250/ 24424 | consumed samples: 128000 | consumed tokens: 262144000 | elapsed time per iteration (s): 2.28 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 5.334759E+00 | grad norm: 1.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.887 | TFLOPs: 23.15 | 63: iteration 260/ 24424 | consumed samples: 133120 | consumed tokens: 272629760 | elapsed time per iteration (s): 2.29 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 5.290163E+00 | grad norm: 1.022 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.192 | TFLOPs: 22.98 | 63: iteration 270/ 24424 | consumed samples: 138240 | consumed tokens: 283115520 | elapsed time per iteration (s): 2.31 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 5.260022E+00 | grad norm: 0.832 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.777 | TFLOPs: 22.83 | 63: iteration 280/ 24424 | consumed samples: 143360 | consumed tokens: 293601280 | elapsed time per iteration (s): 2.27 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 5.261946E+00 | grad norm: 0.738 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.864 | TFLOPs: 23.25 | 63: iteration 290/ 24424 | consumed samples: 148480 | consumed tokens: 304087040 | elapsed time per iteration (s): 2.23 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 5.157094E+00 | grad norm: 1.114 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.408 | TFLOPs: 23.62 | 63: iteration 300/ 24424 | consumed samples: 153600 | consumed tokens: 314572800 | elapsed time per iteration (s): 2.25 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 5.145644E+00 | grad norm: 1.076 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.236 | TFLOPs: 23.39 | 63: iteration 310/ 24424 | consumed samples: 158720 | consumed tokens: 325058560 | elapsed time per iteration (s): 2.23 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 5.065110E+00 | grad norm: 0.777 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.252 | TFLOPs: 23.60 | 63: iteration 320/ 24424 | consumed samples: 163840 | consumed tokens: 335544320 | elapsed time per iteration (s): 2.25 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 5.042933E+00 | grad norm: 0.638 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.355 | TFLOPs: 23.41 | 63: iteration 330/ 24424 | consumed samples: 168960 | consumed tokens: 346030080 | elapsed time per iteration (s): 2.28 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.998720E+00 | grad norm: 0.884 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.481 | TFLOPs: 23.11 | 63: iteration 340/ 24424 | consumed samples: 174080 | consumed tokens: 356515840 | elapsed time per iteration (s): 2.23 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.914594E+00 | grad norm: 0.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.275 | TFLOPs: 23.60 | 63: iteration 350/ 24424 | consumed samples: 179200 | consumed tokens: 367001600 | elapsed time per iteration (s): 2.24 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.903586E+00 | grad norm: 1.118 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.218 | TFLOPs: 23.49 | 63: iteration 360/ 24424 | consumed samples: 184320 | consumed tokens: 377487360 | elapsed time per iteration (s): 2.26 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.911346E+00 | grad norm: 0.528 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.368 | TFLOPs: 23.30 | 63: iteration 370/ 24424 | consumed samples: 189440 | consumed tokens: 387973120 | elapsed time per iteration (s): 2.23 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.826572E+00 | grad norm: 0.641 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.964 | TFLOPs: 23.67 | 63: iteration 380/ 24424 | consumed samples: 194560 | consumed tokens: 398458880 | elapsed time per iteration (s): 2.23 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.784616E+00 | grad norm: 0.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.625 | TFLOPs: 23.64 | 63: iteration 390/ 24424 | consumed samples: 199680 | consumed tokens: 408944640 | elapsed time per iteration (s): 2.23 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.768908E+00 | grad norm: 0.701 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.533 | TFLOPs: 23.63 | 63: iteration 400/ 24424 | consumed samples: 204800 | consumed tokens: 419430400 | elapsed time per iteration (s): 2.27 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.697777E+00 | grad norm: 0.833 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.119 | TFLOPs: 23.17 | 63: iteration 410/ 24424 | consumed samples: 209920 | consumed tokens: 429916160 | elapsed time per iteration (s): 2.23 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.662601E+00 | grad norm: 0.700 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.832 | TFLOPs: 23.66 | 63: iteration 420/ 24424 | consumed samples: 215040 | consumed tokens: 440401920 | elapsed time per iteration (s): 2.67 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.625981E+00 | grad norm: 0.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 191.629 | TFLOPs: 19.73 | 63: iteration 430/ 24424 | consumed samples: 220160 | consumed tokens: 450887680 | elapsed time per iteration (s): 2.23 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.587108E+00 | grad norm: 1.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.279 | TFLOPs: 23.60 | 63: iteration 440/ 24424 | consumed samples: 225280 | consumed tokens: 461373440 | elapsed time per iteration (s): 2.25 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.631916E+00 | grad norm: 0.911 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.376 | TFLOPs: 23.41 | 63: iteration 450/ 24424 | consumed samples: 230400 | consumed tokens: 471859200 | elapsed time per iteration (s): 2.24 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.543847E+00 | grad norm: 0.748 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.539 | TFLOPs: 23.53 | 63: iteration 460/ 24424 | consumed samples: 235520 | consumed tokens: 482344960 | elapsed time per iteration (s): 2.26 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.504483E+00 | grad norm: 0.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.907 | TFLOPs: 23.36 | 63: iteration 470/ 24424 | consumed samples: 240640 | consumed tokens: 492830720 | elapsed time per iteration (s): 2.23 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.381422E+00 | grad norm: 0.622 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.813 | TFLOPs: 23.66 | 63: iteration 480/ 24424 | consumed samples: 245760 | consumed tokens: 503316480 | elapsed time per iteration (s): 2.23 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.382100E+00 | grad norm: 0.755 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.847 | TFLOPs: 23.66 | 63: iteration 490/ 24424 | consumed samples: 250880 | consumed tokens: 513802240 | elapsed time per iteration (s): 2.25 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.330690E+00 | grad norm: 0.846 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.694 | TFLOPs: 23.44 | 63: iteration 500/ 24424 | consumed samples: 256000 | consumed tokens: 524288000 | elapsed time per iteration (s): 2.24 | learning rate: 2.000E-04 | global batch size: 512 | lm loss: 4.278838E+00 | grad norm: 0.686 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.745 | TFLOPs: 23.55 | 63: iteration 510/ 24424 | consumed samples: 261120 | consumed tokens: 534773760 | elapsed time per iteration (s): 2.23 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 4.248718E+00 | grad norm: 1.018 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.854 | TFLOPs: 23.66 | 63: iteration 520/ 24424 | consumed samples: 266240 | consumed tokens: 545259520 | elapsed time per iteration (s): 2.24 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 4.254750E+00 | grad norm: 1.116 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.724 | TFLOPs: 23.55 | 63: iteration 530/ 24424 | consumed samples: 271360 | consumed tokens: 555745280 | elapsed time per iteration (s): 2.27 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 4.153243E+00 | grad norm: 0.824 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.608 | TFLOPs: 23.23 | 63: iteration 540/ 24424 | consumed samples: 276480 | consumed tokens: 566231040 | elapsed time per iteration (s): 2.24 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 4.002406E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.466 | TFLOPs: 23.52 | 63: iteration 550/ 24424 | consumed samples: 281600 | consumed tokens: 576716800 | elapsed time per iteration (s): 2.23 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 3.970443E+00 | grad norm: 0.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.224 | TFLOPs: 23.60 | 63: iteration 560/ 24424 | consumed samples: 286720 | consumed tokens: 587202560 | elapsed time per iteration (s): 2.25 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 3.898996E+00 | grad norm: 0.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.457 | TFLOPs: 23.42 | 63: iteration 570/ 24424 | consumed samples: 291840 | consumed tokens: 597688320 | elapsed time per iteration (s): 2.25 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 3.820966E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.976 | TFLOPs: 23.47 | 63: iteration 580/ 24424 | consumed samples: 296960 | consumed tokens: 608174080 | elapsed time per iteration (s): 2.27 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 3.799116E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.325 | TFLOPs: 23.20 | 63: iteration 590/ 24424 | consumed samples: 302080 | consumed tokens: 618659840 | elapsed time per iteration (s): 2.26 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 3.751442E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.012 | TFLOPs: 23.37 | 63: iteration 600/ 24424 | consumed samples: 307200 | consumed tokens: 629145600 | elapsed time per iteration (s): 2.25 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 3.713437E+00 | grad norm: 0.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.502 | TFLOPs: 23.42 | 63: iteration 610/ 24424 | consumed samples: 312320 | consumed tokens: 639631360 | elapsed time per iteration (s): 2.25 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 3.642234E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.023 | TFLOPs: 23.47 | 63: iteration 620/ 24424 | consumed samples: 317440 | consumed tokens: 650117120 | elapsed time per iteration (s): 2.31 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 3.627555E+00 | grad norm: 0.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.988 | TFLOPs: 22.85 | 63: iteration 630/ 24424 | consumed samples: 322560 | consumed tokens: 660602880 | elapsed time per iteration (s): 2.25 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 3.550185E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.547 | TFLOPs: 23.42 | 63: iteration 640/ 24424 | consumed samples: 327680 | consumed tokens: 671088640 | elapsed time per iteration (s): 2.23 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 3.573977E+00 | grad norm: 0.740 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.179 | TFLOPs: 23.59 | 63: iteration 650/ 24424 | consumed samples: 332800 | consumed tokens: 681574400 | elapsed time per iteration (s): 2.24 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 3.553503E+00 | grad norm: 0.838 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.492 | TFLOPs: 23.52 | 63: iteration 660/ 24424 | consumed samples: 337920 | consumed tokens: 692060160 | elapsed time per iteration (s): 2.29 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 3.597907E+00 | grad norm: 0.938 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.449 | TFLOPs: 23.00 | 63: iteration 670/ 24424 | consumed samples: 343040 | consumed tokens: 702545920 | elapsed time per iteration (s): 2.24 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 3.530952E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.735 | TFLOPs: 23.55 | 63: iteration 680/ 24424 | consumed samples: 348160 | consumed tokens: 713031680 | elapsed time per iteration (s): 2.28 | learning rate: 1.999E-04 | global batch size: 512 | lm loss: 3.439986E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.306 | TFLOPs: 23.09 | 63: iteration 690/ 24424 | consumed samples: 353280 | consumed tokens: 723517440 | elapsed time per iteration (s): 2.23 | learning rate: 1.998E-04 | global batch size: 512 | lm loss: 3.420309E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.275 | TFLOPs: 23.60 | 63: iteration 700/ 24424 | consumed samples: 358400 | consumed tokens: 734003200 | elapsed time per iteration (s): 2.23 | learning rate: 1.998E-04 | global batch size: 512 | lm loss: 3.408589E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.047 | TFLOPs: 23.68 | 63: iteration 710/ 24424 | consumed samples: 363520 | consumed tokens: 744488960 | elapsed time per iteration (s): 2.26 | learning rate: 1.998E-04 | global batch size: 512 | lm loss: 3.359255E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.774 | TFLOPs: 23.35 | 63: iteration 720/ 24424 | consumed samples: 368640 | consumed tokens: 754974720 | elapsed time per iteration (s): 2.25 | learning rate: 1.998E-04 | global batch size: 512 | lm loss: 3.343074E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.782 | TFLOPs: 23.45 | 63: iteration 730/ 24424 | consumed samples: 373760 | consumed tokens: 765460480 | elapsed time per iteration (s): 2.23 | learning rate: 1.998E-04 | global batch size: 512 | lm loss: 3.342560E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.582 | TFLOPs: 23.63 | 63: iteration 740/ 24424 | consumed samples: 378880 | consumed tokens: 775946240 | elapsed time per iteration (s): 2.23 | learning rate: 1.998E-04 | global batch size: 512 | lm loss: 3.307619E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.844 | TFLOPs: 23.66 | 63: iteration 750/ 24424 | consumed samples: 384000 | consumed tokens: 786432000 | elapsed time per iteration (s): 2.23 | learning rate: 1.998E-04 | global batch size: 512 | lm loss: 3.321491E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.783 | TFLOPs: 23.66 | 63: iteration 760/ 24424 | consumed samples: 389120 | consumed tokens: 796917760 | elapsed time per iteration (s): 2.23 | learning rate: 1.998E-04 | global batch size: 512 | lm loss: 3.284094E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.179 | TFLOPs: 23.59 | 63: iteration 770/ 24424 | consumed samples: 394240 | consumed tokens: 807403520 | elapsed time per iteration (s): 2.23 | learning rate: 1.998E-04 | global batch size: 512 | lm loss: 3.278476E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.547 | TFLOPs: 23.63 | 63: iteration 780/ 24424 | consumed samples: 399360 | consumed tokens: 817889280 | elapsed time per iteration (s): 2.23 | learning rate: 1.998E-04 | global batch size: 512 | lm loss: 3.229389E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.250 | TFLOPs: 23.60 | 63: iteration 790/ 24424 | consumed samples: 404480 | consumed tokens: 828375040 | elapsed time per iteration (s): 2.24 | learning rate: 1.998E-04 | global batch size: 512 | lm loss: 3.219941E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.063 | TFLOPs: 23.58 | 63: iteration 800/ 24424 | consumed samples: 409600 | consumed tokens: 838860800 | elapsed time per iteration (s): 2.30 | learning rate: 1.998E-04 | global batch size: 512 | lm loss: 3.213705E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.718 | TFLOPs: 22.93 | 63: iteration 810/ 24424 | consumed samples: 414720 | consumed tokens: 849346560 | elapsed time per iteration (s): 2.23 | learning rate: 1.998E-04 | global batch size: 512 | lm loss: 3.174785E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.040 | TFLOPs: 23.68 | 63: iteration 820/ 24424 | consumed samples: 419840 | consumed tokens: 859832320 | elapsed time per iteration (s): 2.23 | learning rate: 1.997E-04 | global batch size: 512 | lm loss: 3.157965E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.502 | TFLOPs: 23.63 | 63: iteration 830/ 24424 | consumed samples: 424960 | consumed tokens: 870318080 | elapsed time per iteration (s): 2.24 | learning rate: 1.997E-04 | global batch size: 512 | lm loss: 3.157067E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.295 | TFLOPs: 23.50 | 63: iteration 840/ 24424 | consumed samples: 430080 | consumed tokens: 880803840 | elapsed time per iteration (s): 2.23 | learning rate: 1.997E-04 | global batch size: 512 | lm loss: 3.136847E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.621 | TFLOPs: 23.64 | 63: iteration 850/ 24424 | consumed samples: 435200 | consumed tokens: 891289600 | elapsed time per iteration (s): 2.26 | learning rate: 1.997E-04 | global batch size: 512 | lm loss: 3.185075E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.688 | TFLOPs: 23.34 | 63: iteration 860/ 24424 | consumed samples: 440320 | consumed tokens: 901775360 | elapsed time per iteration (s): 2.23 | learning rate: 1.997E-04 | global batch size: 512 | lm loss: 3.133755E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.850 | TFLOPs: 23.66 | 63: iteration 870/ 24424 | consumed samples: 445440 | consumed tokens: 912261120 | elapsed time per iteration (s): 2.25 | learning rate: 1.997E-04 | global batch size: 512 | lm loss: 3.131985E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.380 | TFLOPs: 23.41 | 63: iteration 880/ 24424 | consumed samples: 450560 | consumed tokens: 922746880 | elapsed time per iteration (s): 2.25 | learning rate: 1.997E-04 | global batch size: 512 | lm loss: 3.113045E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.050 | TFLOPs: 23.48 | 63: iteration 890/ 24424 | consumed samples: 455680 | consumed tokens: 933232640 | elapsed time per iteration (s): 2.28 | learning rate: 1.997E-04 | global batch size: 512 | lm loss: 3.077236E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.466 | TFLOPs: 23.11 | 63: iteration 900/ 24424 | consumed samples: 460800 | consumed tokens: 943718400 | elapsed time per iteration (s): 2.23 | learning rate: 1.997E-04 | global batch size: 512 | lm loss: 3.089888E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.839 | TFLOPs: 23.66 | 63: iteration 910/ 24424 | consumed samples: 465920 | consumed tokens: 954204160 | elapsed time per iteration (s): 2.23 | learning rate: 1.997E-04 | global batch size: 512 | lm loss: 3.073868E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.748 | TFLOPs: 23.65 | 63: iteration 920/ 24424 | consumed samples: 471040 | consumed tokens: 964689920 | elapsed time per iteration (s): 2.24 | learning rate: 1.997E-04 | global batch size: 512 | lm loss: 3.039638E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.210 | TFLOPs: 23.49 | 63: iteration 930/ 24424 | consumed samples: 476160 | consumed tokens: 975175680 | elapsed time per iteration (s): 2.27 | learning rate: 1.996E-04 | global batch size: 512 | lm loss: 3.018416E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.229 | TFLOPs: 23.19 | 63: iteration 940/ 24424 | consumed samples: 481280 | consumed tokens: 985661440 | elapsed time per iteration (s): 2.27 | learning rate: 1.996E-04 | global batch size: 512 | lm loss: 2.997235E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.786 | TFLOPs: 23.24 | 63: iteration 950/ 24424 | consumed samples: 486400 | consumed tokens: 996147200 | elapsed time per iteration (s): 2.24 | learning rate: 1.996E-04 | global batch size: 512 | lm loss: 3.004116E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.767 | TFLOPs: 23.55 | 63: iteration 960/ 24424 | consumed samples: 491520 | consumed tokens: 1006632960 | elapsed time per iteration (s): 2.23 | learning rate: 1.996E-04 | global batch size: 512 | lm loss: 3.002920E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.614 | TFLOPs: 23.64 | 63: iteration 970/ 24424 | consumed samples: 496640 | consumed tokens: 1017118720 | elapsed time per iteration (s): 2.25 | learning rate: 1.996E-04 | global batch size: 512 | lm loss: 2.987174E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.466 | TFLOPs: 23.42 | 63: iteration 980/ 24424 | consumed samples: 501760 | consumed tokens: 1027604480 | elapsed time per iteration (s): 2.26 | learning rate: 1.996E-04 | global batch size: 512 | lm loss: 2.984398E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.780 | TFLOPs: 23.35 | 63: iteration 990/ 24424 | consumed samples: 506880 | consumed tokens: 1038090240 | elapsed time per iteration (s): 2.28 | learning rate: 1.996E-04 | global batch size: 512 | lm loss: 3.003063E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.491 | TFLOPs: 23.11 | 63: iteration 1000/ 24424 | consumed samples: 512000 | consumed tokens: 1048576000 | elapsed time per iteration (s): 2.23 | learning rate: 1.996E-04 | global batch size: 512 | lm loss: 3.001862E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.714 | TFLOPs: 23.65 | 63: ------------------------------------------------------------------------------------------ 63: valid loss at iteration 1000 | lm loss value: 2.906823E+00 | lm loss PPL: 1.829857E+01 | 63: ------------------------------------------------------------------------------------------ 0: saving checkpoint at iteration 1000 to checkpoints_3b9 0: [2022-11-25 17:40:04,009] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is begin to save! 0: [2022-11-25 17:40:04,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_01-model_00-model_states.pt... 32: [2022-11-25 17:40:04,035] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_21-model_00-model_states.pt... 32: [2022-11-25 17:40:04,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_21-model_00-model_states.pt. 32: [2022-11-25 17:40:04,329] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_22-model_00-model_states.pt... 0: [2022-11-25 17:40:04,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_01-model_00-model_states.pt. 0: [2022-11-25 17:40:04,402] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_03-model_00-model_states.pt... 32: [2022-11-25 17:40:04,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_22-model_00-model_states.pt. 32: [2022-11-25 17:40:04,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_23-model_00-model_states.pt... 0: [2022-11-25 17:40:04,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_03-model_00-model_states.pt. 0: [2022-11-25 17:40:04,638] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_04-model_00-model_states.pt... 32: [2022-11-25 17:40:04,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_23-model_00-model_states.pt. 32: [2022-11-25 17:40:04,786] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_24-model_00-model_states.pt... 0: [2022-11-25 17:40:04,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_04-model_00-model_states.pt. 0: [2022-11-25 17:40:04,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_05-model_00-model_states.pt... 32: [2022-11-25 17:40:05,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_24-model_00-model_states.pt. 32: [2022-11-25 17:40:05,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_25-model_00-model_states.pt... 0: [2022-11-25 17:40:05,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_05-model_00-model_states.pt. 0: [2022-11-25 17:40:05,085] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_06-model_00-model_states.pt... 32: [2022-11-25 17:40:05,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_25-model_00-model_states.pt. 32: [2022-11-25 17:40:05,267] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_26-model_00-model_states.pt... 0: [2022-11-25 17:40:05,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_06-model_00-model_states.pt. 0: [2022-11-25 17:40:05,311] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_07-model_00-model_states.pt... 0: [2022-11-25 17:40:05,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_07-model_00-model_states.pt. 0: [2022-11-25 17:40:05,533] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_08-model_00-model_states.pt... 32: [2022-11-25 17:40:05,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_26-model_00-model_states.pt. 32: [2022-11-25 17:40:05,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_27-model_00-model_states.pt... 0: [2022-11-25 17:40:05,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_08-model_00-model_states.pt. 0: [2022-11-25 17:40:05,758] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_09-model_00-model_states.pt... 32: [2022-11-25 17:40:05,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_27-model_00-model_states.pt. 32: [2022-11-25 17:40:05,767] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_28-model_00-model_states.pt... 0: [2022-11-25 17:40:05,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_09-model_00-model_states.pt. 0: [2022-11-25 17:40:05,979] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_10-model_00-model_states.pt... 32: [2022-11-25 17:40:06,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_28-model_00-model_states.pt. 32: [2022-11-25 17:40:06,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_29-model_00-model_states.pt... 0: [2022-11-25 17:40:06,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_10-model_00-model_states.pt. 0: [2022-11-25 17:40:06,209] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_11-model_00-model_states.pt... 32: [2022-11-25 17:40:06,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_29-model_00-model_states.pt. 32: [2022-11-25 17:40:06,298] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_30-model_00-model_states.pt... 0: [2022-11-25 17:40:06,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_11-model_00-model_states.pt. 0: [2022-11-25 17:40:06,428] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_12-model_00-model_states.pt... 32: [2022-11-25 17:40:06,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_30-model_00-model_states.pt. 32: [2022-11-25 17:40:06,542] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_31-model_00-model_states.pt... 0: [2022-11-25 17:40:06,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_12-model_00-model_states.pt. 0: [2022-11-25 17:40:06,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_13-model_00-model_states.pt... 32: [2022-11-25 17:40:06,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_31-model_00-model_states.pt. 32: [2022-11-25 17:40:06,801] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_32-model_00-model_states.pt... 0: [2022-11-25 17:40:06,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_13-model_00-model_states.pt. 0: [2022-11-25 17:40:06,871] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_14-model_00-model_states.pt... 32: [2022-11-25 17:40:07,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_32-model_00-model_states.pt. 32: [2022-11-25 17:40:07,036] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_33-model_00-model_states.pt... 0: [2022-11-25 17:40:07,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_14-model_00-model_states.pt. 0: [2022-11-25 17:40:07,100] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_15-model_00-model_states.pt... 32: [2022-11-25 17:40:07,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_33-model_00-model_states.pt. 32: [2022-11-25 17:40:07,289] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_34-model_00-model_states.pt... 0: [2022-11-25 17:40:07,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_15-model_00-model_states.pt. 0: [2022-11-25 17:40:07,316] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_16-model_00-model_states.pt... 32: [2022-11-25 17:40:07,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_34-model_00-model_states.pt. 32: [2022-11-25 17:40:07,521] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_35-model_00-model_states.pt... 0: [2022-11-25 17:40:07,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_16-model_00-model_states.pt. 0: [2022-11-25 17:40:07,544] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_17-model_00-model_states.pt... 32: [2022-11-25 17:40:07,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_35-model_00-model_states.pt. 32: [2022-11-25 17:40:07,738] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_36-model_00-model_states.pt... 0: [2022-11-25 17:40:07,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_17-model_00-model_states.pt. 0: [2022-11-25 17:40:07,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_18-model_00-model_states.pt... 0: [2022-11-25 17:40:07,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_18-model_00-model_states.pt. 0: [2022-11-25 17:40:07,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_19-model_00-model_states.pt... 32: [2022-11-25 17:40:07,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_36-model_00-model_states.pt. 32: [2022-11-25 17:40:07,996] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_37-model_00-model_states.pt... 0: [2022-11-25 17:40:08,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_19-model_00-model_states.pt. 0: [2022-11-25 17:40:08,203] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_20-model_00-model_states.pt... 32: [2022-11-25 17:40:08,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_37-model_00-model_states.pt. 32: [2022-11-25 17:40:08,223] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_38-model_00-model_states.pt... 0: [2022-11-25 17:40:08,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_20-model_00-model_states.pt. 0: [2022-11-25 17:40:08,426] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step1000/mp_rank_00_model_states.pt 0: [2022-11-25 17:40:08,426] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/mp_rank_00_model_states.pt... 0: [2022-11-25 17:40:08,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/mp_rank_00_model_states.pt. 32: [2022-11-25 17:40:08,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_38-model_00-model_states.pt. 32: [2022-11-25 17:40:08,441] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/layer_40-model_00-model_states.pt... 32: [2022-11-25 17:40:08,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/layer_40-model_00-model_states.pt. 32: [2022-11-25 17:40:08,449] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/mp_rank_01_model_states.pt... 32: [2022-11-25 17:40:08,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/mp_rank_01_model_states.pt. 0: [2022-11-25 17:40:08,619] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 32: [2022-11-25 17:40:08,619] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 51: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 55: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 55: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 55: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 53: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 57: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 61: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 61: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 59: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 59: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 63: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 63: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 19: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 19: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 19: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 19: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 21: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 15: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 15: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 52: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 54: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 54: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 56: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 56: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 58: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 58: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 58: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 62: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 62: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 62: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 16: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 16: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 16: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 16: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 60: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 60: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 2: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 2: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 4: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 4: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 4: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 14: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 18: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 18: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 18: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 10: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 10: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 10: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 10: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 10: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 8: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 8: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 26: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 11: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 11: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 17: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 17: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 17: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 9: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 9: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 9: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 9: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 13: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 13: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 13: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 23: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 23: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 23: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 25: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 25: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 25: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 25: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 25: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 27: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 27: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 29: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 29: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 31: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 31: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 31: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 31: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 31: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 31: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 5: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 5: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 5: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 5: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 33: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 1: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 1: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 1: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 7: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 7: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 7: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 7: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 7: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 3: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 3: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 3: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 3: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 35: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 35: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 22: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 22: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 22: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 22: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 22: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 12: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 12: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 12: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 12: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 12: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 12: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 30: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 30: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 30: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 30: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 30: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 30: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 24: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 24: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 24: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 28: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 28: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 28: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 28: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 28: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 20: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 20: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 20: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 34: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 34: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 40: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 40: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 40: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 44: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 6: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 6: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 32: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 42: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 42: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 42: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 42: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 46: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 36: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 36: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 36: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 48: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 48: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 41: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 37: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 47: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 47: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 49: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 49: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 43: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 43: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 43: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 45: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 45: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 39: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 39: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 50: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 50: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 38: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 51: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 51: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 55: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 53: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 53: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 57: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 61: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 61: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 59: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 59: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 63: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 19: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 19: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 19: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 21: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 21: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 21: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 21: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 15: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 15: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 52: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 52: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 54: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 54: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 56: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 58: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 62: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 62: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 16: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 16: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 16: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 60: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 60: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 2: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 2: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 2: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 2: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 2: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 2: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 4: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 4: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 14: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 14: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 14: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 14: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 14: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 18: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 18: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 18: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 18: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 18: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 10: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 8: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 0: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 0: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 26: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 26: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 11: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 17: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 17: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 9: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 13: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 13: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 13: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 13: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 13: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 23: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 23: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 23: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 23: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 23: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 25: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 27: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 27: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 27: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 27: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 27: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 27: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 29: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 29: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 29: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 31: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 31: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 5: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 33: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 33: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 33: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 1: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 1: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 1: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 7: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 3: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 3: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 35: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 22: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 22: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 12: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 12: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 30: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 30: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 24: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 24: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 28: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 20: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 34: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 34: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 40: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 44: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 44: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 44: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 6: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 6: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 6: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 6: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 6: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 6: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 32: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 42: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 46: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 46: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 46: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 36: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 36: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 48: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 48: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 41: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 37: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 37: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 37: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 37: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 47: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 47: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 49: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 43: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 45: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 39: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 39: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 39: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 50: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 38: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 38: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 51: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 55: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 53: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 53: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 57: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 57: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 63: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 63: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 19: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 21: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 15: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 52: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 56: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 16: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 60: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 4: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 14: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 10: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 0: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 0: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 0: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 26: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 26: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 26: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 11: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 17: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 9: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 25: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 29: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 5: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 33: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 1: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 1: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 7: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 3: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 35: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 22: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 24: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 24: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 24: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 28: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 28: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 40: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 44: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 44: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 32: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 42: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 46: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 46: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 48: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 41: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 43: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 45: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 38: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 21: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 21: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 15: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 4: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 14: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 10: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 8: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 8: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 0: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 11: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 17: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 9: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 29: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 33: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 3: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 20: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 32: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 32: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 41: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 15: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 4: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 8: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 11: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 9: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 29: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 20: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 20: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 32: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 32: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 15: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 8: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 11: [2022-11-25 17:40:08,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step1000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 0: [2022-11-25 17:40:08,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 0: [2022-11-25 17:40:08,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 0: [2022-11-25 17:40:08,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 60: [2022-11-25 17:40:08,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 60: [2022-11-25 17:40:08,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-25 17:40:08,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 60: [2022-11-25 17:40:08,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-25 17:40:08,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-25 17:40:08,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: [2022-11-25 17:40:08,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-25 17:40:08,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-25 17:40:08,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 20: [2022-11-25 17:40:08,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-25 17:40:08,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-25 17:40:08,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: [2022-11-25 17:40:08,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 29: [2022-11-25 17:40:08,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-25 17:40:08,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-25 17:40:08,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 60: [2022-11-25 17:40:08,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-25 17:40:08,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-25 17:40:08,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 20: [2022-11-25 17:40:08,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 20: [2022-11-25 17:40:08,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-25 17:40:08,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 17:40:08,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 30: [2022-11-25 17:40:08,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-25 17:40:08,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 17:40:08,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 32: [2022-11-25 17:40:08,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-25 17:40:08,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 23: [2022-11-25 17:40:08,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 32: [2022-11-25 17:40:08,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 17:40:08,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 17: [2022-11-25 17:40:08,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-25 17:40:08,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-25 17:40:08,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 17: [2022-11-25 17:40:08,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 17: [2022-11-25 17:40:08,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-25 17:40:08,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 32: [2022-11-25 17:40:08,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-25 17:40:08,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-25 17:40:08,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 17:40:08,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-25 17:40:08,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-25 17:40:08,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: [2022-11-25 17:40:08,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 28: [2022-11-25 17:40:08,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 0: [2022-11-25 17:40:08,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 28: [2022-11-25 17:40:08,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 0: [2022-11-25 17:40:08,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 28: [2022-11-25 17:40:08,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: [2022-11-25 17:40:08,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-25 17:40:08,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-25 17:40:08,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 17:40:08,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 23: [2022-11-25 17:40:08,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 51: [2022-11-25 17:40:08,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-25 17:40:08,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 17:40:08,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-25 17:40:08,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 54: [2022-11-25 17:40:08,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 54: [2022-11-25 17:40:08,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-25 17:40:08,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 17:40:08,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 9: [2022-11-25 17:40:08,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-25 17:40:08,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 17:40:08,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-25 17:40:08,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-25 17:40:08,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-25 17:40:08,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-25 17:40:08,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 17:40:08,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 33: [2022-11-25 17:40:08,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 33: [2022-11-25 17:40:08,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 33: [2022-11-25 17:40:08,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 32: [2022-11-25 17:40:08,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-25 17:40:08,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 5: [2022-11-25 17:40:08,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 32: [2022-11-25 17:40:08,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 17:40:08,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 5: [2022-11-25 17:40:08,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-25 17:40:08,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 20: [2022-11-25 17:40:08,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-25 17:40:08,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 20: [2022-11-25 17:40:08,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 41: [2022-11-25 17:40:08,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-25 17:40:08,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-25 17:40:08,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 17:40:08,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-25 17:40:08,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-25 17:40:08,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 46: [2022-11-25 17:40:08,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-25 17:40:08,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-25 17:40:08,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 17:40:08,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-25 17:40:08,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-25 17:40:08,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 32: [2022-11-25 17:40:08,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-25 17:40:08,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-25 17:40:08,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 31: [2022-11-25 17:40:08,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 31: [2022-11-25 17:40:08,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-25 17:40:08,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 22: [2022-11-25 17:40:08,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-25 17:40:08,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-25 17:40:08,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 3: [2022-11-25 17:40:08,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 3: [2022-11-25 17:40:08,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-25 17:40:08,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-25 17:40:08,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 17:40:08,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 3: [2022-11-25 17:40:08,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 3: [2022-11-25 17:40:08,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 63: [2022-11-25 17:40:08,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-25 17:40:08,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 9: [2022-11-25 17:40:08,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 63: [2022-11-25 17:40:08,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 17:40:08,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 17:40:08,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-25 17:40:08,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 27: [2022-11-25 17:40:08,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 29: [2022-11-25 17:40:08,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 27: [2022-11-25 17:40:08,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 2: [2022-11-25 17:40:08,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 2: [2022-11-25 17:40:08,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-25 17:40:08,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 27: [2022-11-25 17:40:08,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 29: [2022-11-25 17:40:08,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 2: [2022-11-25 17:40:08,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-25 17:40:08,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 29: [2022-11-25 17:40:08,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 2: [2022-11-25 17:40:08,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-25 17:40:08,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 2: [2022-11-25 17:40:08,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 17:40:08,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-25 17:40:08,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-25 17:40:08,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 2: [2022-11-25 17:40:08,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 17:40:08,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 14: [2022-11-25 17:40:08,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 27: [2022-11-25 17:40:08,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 39: [2022-11-25 17:40:08,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 16: [2022-11-25 17:40:08,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 39: [2022-11-25 17:40:08,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 14: [2022-11-25 17:40:08,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 27: [2022-11-25 17:40:08,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 14: [2022-11-25 17:40:08,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 17:40:08,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 16: [2022-11-25 17:40:08,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 27: [2022-11-25 17:40:08,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 16: [2022-11-25 17:40:08,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 29: [2022-11-25 17:40:08,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-25 17:40:08,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 29: [2022-11-25 17:40:08,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 13: [2022-11-25 17:40:08,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 13: [2022-11-25 17:40:08,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-25 17:40:08,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-25 17:40:08,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-25 17:40:08,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 13: [2022-11-25 17:40:08,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 43: [2022-11-25 17:40:08,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-25 17:40:08,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-25 17:40:08,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 17:40:08,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 7: [2022-11-25 17:40:08,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-25 17:40:08,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 17:40:08,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-25 17:40:08,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 17:40:08,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-25 17:40:08,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-25 17:40:08,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 55: [2022-11-25 17:40:08,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-25 17:40:08,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-25 17:40:08,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 17:40:08,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-25 17:40:08,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 55: [2022-11-25 17:40:08,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 23: [2022-11-25 17:40:08,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 55: [2022-11-25 17:40:08,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-25 17:40:08,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 37: [2022-11-25 17:40:08,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-25 17:40:08,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-25 17:40:08,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 17:40:08,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 7: [2022-11-25 17:40:08,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-25 17:40:08,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 17:40:08,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-25 17:40:08,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-25 17:40:08,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 28: [2022-11-25 17:40:08,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 28: [2022-11-25 17:40:08,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-25 17:40:08,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 17:40:08,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 28: [2022-11-25 17:40:08,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 9: [2022-11-25 17:40:08,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 28: [2022-11-25 17:40:08,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 9: [2022-11-25 17:40:08,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 28: [2022-11-25 17:40:08,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 1: [2022-11-25 17:40:08,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-25 17:40:08,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-25 17:40:08,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 54: [2022-11-25 17:40:08,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 54: [2022-11-25 17:40:08,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-25 17:40:08,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 17:40:08,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 26: [2022-11-25 17:40:08,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-25 17:40:08,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 47: [2022-11-25 17:40:08,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-25 17:40:08,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 26: [2022-11-25 17:40:08,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 17:40:08,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 17:40:08,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-25 17:40:08,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 32: [2022-11-25 17:40:08,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 30: [2022-11-25 17:40:08,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 32: [2022-11-25 17:40:08,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 6: [2022-11-25 17:40:08,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-25 17:40:08,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 32: [2022-11-25 17:40:08,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 6: [2022-11-25 17:40:08,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-25 17:40:08,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-25 17:40:08,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 6: [2022-11-25 17:40:08,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 40: [2022-11-25 17:40:08,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 57: [2022-11-25 17:40:08,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 40: [2022-11-25 17:40:08,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-25 17:40:08,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 57: [2022-11-25 17:40:08,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 57: [2022-11-25 17:40:08,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 8: [2022-11-25 17:40:08,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-25 17:40:08,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-25 17:40:08,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 8: [2022-11-25 17:40:08,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 10: [2022-11-25 17:40:08,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 8: [2022-11-25 17:40:08,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-25 17:40:08,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 10: [2022-11-25 17:40:08,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 6: [2022-11-25 17:40:08,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 6: [2022-11-25 17:40:08,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 58: [2022-11-25 17:40:08,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 10: [2022-11-25 17:40:08,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 17:40:08,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 58: [2022-11-25 17:40:08,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 6: [2022-11-25 17:40:08,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 17:40:08,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 58: [2022-11-25 17:40:08,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 17:40:08,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 33: [2022-11-25 17:40:08,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-25 17:40:08,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-25 17:40:08,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 37: [2022-11-25 17:40:08,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-25 17:40:08,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 1: [2022-11-25 17:40:08,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 37: [2022-11-25 17:40:08,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 1: [2022-11-25 17:40:08,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-25 17:40:08,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-25 17:40:08,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-25 17:40:08,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 1: [2022-11-25 17:40:08,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 49: [2022-11-25 17:40:08,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-25 17:40:08,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-25 17:40:08,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 3: [2022-11-25 17:40:08,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 3: [2022-11-25 17:40:08,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-25 17:40:08,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 17:40:08,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 11: [2022-11-25 17:40:08,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 51: [2022-11-25 17:40:08,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-25 17:40:08,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 11: [2022-11-25 17:40:08,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 51: [2022-11-25 17:40:08,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 17:40:08,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 11: [2022-11-25 17:40:08,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 17:40:08,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 54: [2022-11-25 17:40:08,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 12: [2022-11-25 17:40:08,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-25 17:40:08,793] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-25 17:40:08,793] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 54: [2022-11-25 17:40:08,793] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-25 17:40:08,793] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 17:40:08,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-25 17:40:08,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-25 17:40:08,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 17:40:08,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 56: [2022-11-25 17:40:08,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-25 17:40:08,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 45: [2022-11-25 17:40:08,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-25 17:40:08,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 46: [2022-11-25 17:40:08,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 45: [2022-11-25 17:40:08,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 46: [2022-11-25 17:40:08,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-25 17:40:08,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 46: [2022-11-25 17:40:08,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-25 17:40:08,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-25 17:40:08,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 17:40:08,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-25 17:40:08,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 5: [2022-11-25 17:40:08,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 17:40:08,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 34: [2022-11-25 17:40:08,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 39: [2022-11-25 17:40:08,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-25 17:40:08,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 34: [2022-11-25 17:40:08,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-25 17:40:08,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 49: [2022-11-25 17:40:08,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 49: [2022-11-25 17:40:08,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-25 17:40:08,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 55: [2022-11-25 17:40:08,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-25 17:40:08,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-25 17:40:08,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 45: [2022-11-25 17:40:08,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-25 17:40:08,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-25 17:40:08,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 16: [2022-11-25 17:40:08,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-25 17:40:08,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-25 17:40:08,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 62: [2022-11-25 17:40:08,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-25 17:40:08,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-25 17:40:08,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 44: [2022-11-25 17:40:08,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-25 17:40:08,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-25 17:40:08,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 17:40:08,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 61: [2022-11-25 17:40:08,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-25 17:40:08,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 17: [2022-11-25 17:40:08,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 17: [2022-11-25 17:40:08,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 17: [2022-11-25 17:40:08,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 8: [2022-11-25 17:40:08,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 25: [2022-11-25 17:40:08,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 8: [2022-11-25 17:40:08,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 25: [2022-11-25 17:40:08,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 8: [2022-11-25 17:40:08,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 17:40:08,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 22: [2022-11-25 17:40:08,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-25 17:40:08,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-25 17:40:08,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 17:40:08,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-25 17:40:08,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-25 17:40:08,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: [2022-11-25 17:40:08,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-25 17:40:08,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 57: [2022-11-25 17:40:08,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 42: [2022-11-25 17:40:08,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 57: [2022-11-25 17:40:08,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-25 17:40:08,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 17:40:08,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-25 17:40:08,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 17:40:08,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-25 17:40:08,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-25 17:40:08,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 43: [2022-11-25 17:40:08,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 43: [2022-11-25 17:40:08,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 43: [2022-11-25 17:40:08,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 17:40:08,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-25 17:40:08,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-25 17:40:08,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 56: [2022-11-25 17:40:08,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-25 17:40:08,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 17:40:08,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 34: [2022-11-25 17:40:08,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 34: [2022-11-25 17:40:08,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-25 17:40:08,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 17:40:08,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 11: [2022-11-25 17:40:08,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 9: [2022-11-25 17:40:08,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 11: [2022-11-25 17:40:08,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 11: [2022-11-25 17:40:08,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 18: [2022-11-25 17:40:08,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-25 17:40:08,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-25 17:40:08,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 29: [2022-11-25 17:40:08,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 29: [2022-11-25 17:40:08,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-25 17:40:08,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 27: [2022-11-25 17:40:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 36: [2022-11-25 17:40:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 36: [2022-11-25 17:40:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 36: [2022-11-25 17:40:08,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-25 17:40:08,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-25 17:40:08,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 27: [2022-11-25 17:40:08,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-25 17:40:08,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 36: [2022-11-25 17:40:08,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 36: [2022-11-25 17:40:08,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 17:40:08,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 36: [2022-11-25 17:40:08,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 36: [2022-11-25 17:40:08,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 17:40:08,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-25 17:40:08,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 31: [2022-11-25 17:40:08,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-25 17:40:08,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 42: [2022-11-25 17:40:08,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 31: [2022-11-25 17:40:08,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 17:40:08,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 42: [2022-11-25 17:40:08,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 17:40:08,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-25 17:40:08,809] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-25 17:40:08,809] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 11: [2022-11-25 17:40:08,809] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-25 17:40:08,809] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-25 17:40:08,809] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 17:40:08,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 42: [2022-11-25 17:40:08,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-25 17:40:08,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 43: [2022-11-25 17:40:08,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-25 17:40:08,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 43: [2022-11-25 17:40:08,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 13: [2022-11-25 17:40:08,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 50: [2022-11-25 17:40:08,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 50: [2022-11-25 17:40:08,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-25 17:40:08,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 13: [2022-11-25 17:40:08,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-25 17:40:08,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 50: [2022-11-25 17:40:08,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-25 17:40:08,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 19: [2022-11-25 17:40:08,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 50: [2022-11-25 17:40:08,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 17:40:08,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-25 17:40:08,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 45: [2022-11-25 17:40:08,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-25 17:40:08,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-25 17:40:08,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 37: [2022-11-25 17:40:08,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-25 17:40:08,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-25 17:40:08,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 22: [2022-11-25 17:40:08,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-25 17:40:08,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-25 17:40:08,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 32: [2022-11-25 17:40:08,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-25 17:40:08,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-25 17:40:08,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 24: [2022-11-25 17:40:08,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-25 17:40:08,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-25 17:40:08,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 41: [2022-11-25 17:40:08,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-25 17:40:08,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-25 17:40:08,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 14: [2022-11-25 17:40:08,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-25 17:40:08,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-25 17:40:08,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 17:40:08,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-25 17:40:08,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-25 17:40:08,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 41: [2022-11-25 17:40:08,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 41: [2022-11-25 17:40:08,817] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 41: [2022-11-25 17:40:08,817] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 17:40:08,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 17: [2022-11-25 17:40:08,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 25: [2022-11-25 17:40:08,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 17: [2022-11-25 17:40:08,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-25 17:40:08,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 17:40:08,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 63: [2022-11-25 17:40:08,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-25 17:40:08,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-25 17:40:08,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 17:40:08,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-25 17:40:08,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-25 17:40:08,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 59: [2022-11-25 17:40:08,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-25 17:40:08,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-25 17:40:08,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 34: [2022-11-25 17:40:08,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 51: [2022-11-25 17:40:08,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 34: [2022-11-25 17:40:08,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 51: [2022-11-25 17:40:08,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 34: [2022-11-25 17:40:08,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 17:40:08,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 17:40:08,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 12: [2022-11-25 17:40:08,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-25 17:40:08,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-25 17:40:08,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 35: [2022-11-25 17:40:08,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 35: [2022-11-25 17:40:08,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-25 17:40:08,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 1: [2022-11-25 17:40:08,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-25 17:40:08,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 1: [2022-11-25 17:40:08,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 17:40:08,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-25 17:40:08,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 2: [2022-11-25 17:40:08,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 2: [2022-11-25 17:40:08,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 21: [2022-11-25 17:40:08,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-25 17:40:08,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 2: [2022-11-25 17:40:08,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 21: [2022-11-25 17:40:08,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-25 17:40:08,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 62: [2022-11-25 17:40:08,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-25 17:40:08,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 21: [2022-11-25 17:40:08,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 62: [2022-11-25 17:40:08,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 21: [2022-11-25 17:40:08,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 17:40:08,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-25 17:40:08,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-25 17:40:08,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-25 17:40:08,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-25 17:40:08,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 17:40:08,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 20: [2022-11-25 17:40:08,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-25 17:40:08,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-25 17:40:08,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 63: [2022-11-25 17:40:08,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-25 17:40:08,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-25 17:40:08,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 40: [2022-11-25 17:40:08,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 40: [2022-11-25 17:40:08,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 40: [2022-11-25 17:40:08,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 50: [2022-11-25 17:40:08,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 50: [2022-11-25 17:40:08,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-25 17:40:08,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 49: [2022-11-25 17:40:08,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-25 17:40:08,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-25 17:40:08,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 44: [2022-11-25 17:40:08,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-25 17:40:08,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-25 17:40:08,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-25 17:40:08,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 44: [2022-11-25 17:40:08,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-25 17:40:08,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 17:40:08,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-25 17:40:08,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-25 17:40:08,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 17:40:08,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-25 17:40:08,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-25 17:40:08,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 54: [2022-11-25 17:40:08,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 54: [2022-11-25 17:40:08,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-25 17:40:08,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 35: [2022-11-25 17:40:08,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-25 17:40:08,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-25 17:40:08,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 17:40:08,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 19: [2022-11-25 17:40:08,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-25 17:40:08,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 26: [2022-11-25 17:40:08,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 17: [2022-11-25 17:40:08,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 26: [2022-11-25 17:40:08,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 17: [2022-11-25 17:40:08,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 26: [2022-11-25 17:40:08,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 17: [2022-11-25 17:40:08,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 44: [2022-11-25 17:40:08,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-25 17:40:08,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-25 17:40:08,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 26: [2022-11-25 17:40:08,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-25 17:40:08,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-25 17:40:08,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 17:40:08,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-25 17:40:08,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-25 17:40:08,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 60: [2022-11-25 17:40:08,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-25 17:40:08,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-25 17:40:08,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 38: [2022-11-25 17:40:08,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 38: [2022-11-25 17:40:08,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-25 17:40:08,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 17:40:08,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-25 17:40:08,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-25 17:40:08,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 17:40:08,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-25 17:40:08,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 47: [2022-11-25 17:40:08,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 29: [2022-11-25 17:40:08,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-25 17:40:08,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-25 17:40:08,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 29: [2022-11-25 17:40:08,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-25 17:40:08,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-25 17:40:08,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 17:40:08,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-25 17:40:08,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-25 17:40:08,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 17: [2022-11-25 17:40:08,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 28: [2022-11-25 17:40:08,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 28: [2022-11-25 17:40:08,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 17: [2022-11-25 17:40:08,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-25 17:40:08,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 28: [2022-11-25 17:40:08,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 40: [2022-11-25 17:40:08,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-25 17:40:08,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-25 17:40:08,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 17:40:08,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-25 17:40:08,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-25 17:40:08,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 17:40:08,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-25 17:40:08,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-25 17:40:08,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 6: [2022-11-25 17:40:08,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-25 17:40:08,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-25 17:40:08,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 59: [2022-11-25 17:40:08,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-25 17:40:08,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-25 17:40:08,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 36: [2022-11-25 17:40:08,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-25 17:40:08,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-25 17:40:08,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 40: [2022-11-25 17:40:08,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-25 17:40:08,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-25 17:40:08,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 17:40:08,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-25 17:40:08,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-25 17:40:08,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 17:40:08,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-25 17:40:08,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-25 17:40:08,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 57: [2022-11-25 17:40:08,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 57: [2022-11-25 17:40:08,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 57: [2022-11-25 17:40:08,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 35: [2022-11-25 17:40:08,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 35: [2022-11-25 17:40:08,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-25 17:40:08,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 43: [2022-11-25 17:40:08,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-25 17:40:08,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-25 17:40:08,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 17:40:08,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 5: [2022-11-25 17:40:08,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-25 17:40:08,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: [2022-11-25 17:40:08,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 0: [2022-11-25 17:40:08,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-25 17:40:08,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 45: [2022-11-25 17:40:08,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-25 17:40:08,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-25 17:40:08,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 17:40:08,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-25 17:40:08,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-25 17:40:08,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 10: [2022-11-25 17:40:08,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-25 17:40:08,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-25 17:40:08,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 41: [2022-11-25 17:40:08,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-25 17:40:08,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-25 17:40:08,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 11: [2022-11-25 17:40:08,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-25 17:40:08,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-25 17:40:08,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 10: [2022-11-25 17:40:08,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-25 17:40:08,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 10: [2022-11-25 17:40:08,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 46: [2022-11-25 17:40:08,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-25 17:40:08,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-25 17:40:08,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 60: [2022-11-25 17:40:08,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-25 17:40:08,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-25 17:40:08,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 16: [2022-11-25 17:40:08,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 16: [2022-11-25 17:40:08,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-25 17:40:08,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 26: [2022-11-25 17:40:08,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-25 17:40:08,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-25 17:40:08,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 17:40:08,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-25 17:40:08,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-25 17:40:08,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 31: [2022-11-25 17:40:08,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-25 17:40:08,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-25 17:40:08,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 60: [2022-11-25 17:40:08,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-25 17:40:08,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-25 17:40:08,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 17:40:08,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 16: [2022-11-25 17:40:08,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 4: [2022-11-25 17:40:08,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 16: [2022-11-25 17:40:08,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 4: [2022-11-25 17:40:08,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 16: [2022-11-25 17:40:08,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 17:40:08,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 4: [2022-11-25 17:40:08,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 17:40:08,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-25 17:40:08,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 17:40:08,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-25 17:40:08,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 20: [2022-11-25 17:40:08,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-25 17:40:08,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-25 17:40:08,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 33: [2022-11-25 17:40:08,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-25 17:40:08,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-25 17:40:08,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 24: [2022-11-25 17:40:08,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 24: [2022-11-25 17:40:08,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-25 17:40:08,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 37: [2022-11-25 17:40:08,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 37: [2022-11-25 17:40:08,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-25 17:40:08,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 21: [2022-11-25 17:40:08,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-25 17:40:08,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-25 17:40:08,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 17:40:08,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 61: [2022-11-25 17:40:08,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 30: [2022-11-25 17:40:08,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-25 17:40:08,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 17:40:08,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-25 17:40:08,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 15: [2022-11-25 17:40:08,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 15: [2022-11-25 17:40:08,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-25 17:40:08,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 14: [2022-11-25 17:40:08,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 20: [2022-11-25 17:40:08,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 14: [2022-11-25 17:40:08,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 20: [2022-11-25 17:40:08,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-25 17:40:08,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 14: [2022-11-25 17:40:08,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 31: [2022-11-25 17:40:08,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-25 17:40:08,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-25 17:40:08,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 22: [2022-11-25 17:40:08,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-25 17:40:08,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-25 17:40:08,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 24: [2022-11-25 17:40:08,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-25 17:40:08,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-25 17:40:08,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 15: [2022-11-25 17:40:08,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 34: [2022-11-25 17:40:08,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 15: [2022-11-25 17:40:08,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 15: [2022-11-25 17:40:08,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 34: [2022-11-25 17:40:08,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-25 17:40:08,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 15: [2022-11-25 17:40:08,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-25 17:40:08,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-25 17:40:08,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 17:40:08,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-25 17:40:08,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-25 17:40:08,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 60: [2022-11-25 17:40:08,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 60: [2022-11-25 17:40:08,884] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 60: [2022-11-25 17:40:08,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 35: [2022-11-25 17:40:08,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 35: [2022-11-25 17:40:08,884] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-25 17:40:08,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 8: [2022-11-25 17:40:08,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 55: [2022-11-25 17:40:08,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 62: [2022-11-25 17:40:08,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 55: [2022-11-25 17:40:08,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 62: [2022-11-25 17:40:08,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 8: [2022-11-25 17:40:08,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 55: [2022-11-25 17:40:08,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 62: [2022-11-25 17:40:08,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 8: [2022-11-25 17:40:08,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 18: [2022-11-25 17:40:08,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-25 17:40:08,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-25 17:40:08,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 18: [2022-11-25 17:40:08,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 18: [2022-11-25 17:40:08,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 29: [2022-11-25 17:40:08,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 18: [2022-11-25 17:40:08,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 29: [2022-11-25 17:40:08,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-25 17:40:08,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 33: [2022-11-25 17:40:08,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-25 17:40:08,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-25 17:40:08,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 49: [2022-11-25 17:40:08,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-25 17:40:08,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-25 17:40:08,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 16: [2022-11-25 17:40:08,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-25 17:40:08,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 16: [2022-11-25 17:40:08,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 38: [2022-11-25 17:40:08,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-25 17:40:08,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-25 17:40:08,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: [2022-11-25 17:40:08,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-25 17:40:08,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-25 17:40:08,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 17: [2022-11-25 17:40:08,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-25 17:40:08,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-25 17:40:08,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 57: [2022-11-25 17:40:08,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-25 17:40:08,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-25 17:40:08,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 17:40:08,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-25 17:40:08,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-25 17:40:08,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 24: [2022-11-25 17:40:08,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-25 17:40:08,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-25 17:40:08,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 50: [2022-11-25 17:40:08,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-25 17:40:08,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-25 17:40:08,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 16: [2022-11-25 17:40:08,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-25 17:40:08,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-25 17:40:08,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 8: [2022-11-25 17:40:08,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-25 17:40:08,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-25 17:40:08,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 27: [2022-11-25 17:40:08,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-25 17:40:08,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-25 17:40:08,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 17:40:08,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 42: [2022-11-25 17:40:08,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 32: [2022-11-25 17:40:08,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 30: [2022-11-25 17:40:08,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 42: [2022-11-25 17:40:08,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 32: [2022-11-25 17:40:08,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-25 17:40:08,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 17:40:08,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 17:40:08,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 3: [2022-11-25 17:40:08,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 16: [2022-11-25 17:40:08,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 12: [2022-11-25 17:40:08,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 16: [2022-11-25 17:40:08,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-25 17:40:08,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 3: [2022-11-25 17:40:08,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-25 17:40:08,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 17:40:08,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-25 17:40:08,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 20: [2022-11-25 17:40:08,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-25 17:40:08,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-25 17:40:08,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 2: [2022-11-25 17:40:08,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-25 17:40:08,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 19: [2022-11-25 17:40:08,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 2: [2022-11-25 17:40:08,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 17:40:08,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-25 17:40:08,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 28: [2022-11-25 17:40:08,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-25 17:40:08,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-25 17:40:08,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 17:40:08,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 42: [2022-11-25 17:40:08,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-25 17:40:08,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 54: [2022-11-25 17:40:08,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 10: [2022-11-25 17:40:08,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 54: [2022-11-25 17:40:08,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 10: [2022-11-25 17:40:08,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-25 17:40:08,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 54: [2022-11-25 17:40:08,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 62: [2022-11-25 17:40:08,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-25 17:40:08,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-25 17:40:08,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 17:40:08,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-25 17:40:08,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-25 17:40:08,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 33: [2022-11-25 17:40:08,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-25 17:40:08,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-25 17:40:08,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 38: [2022-11-25 17:40:08,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-25 17:40:08,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-25 17:40:08,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 17:40:08,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-25 17:40:08,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-25 17:40:08,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 11: [2022-11-25 17:40:08,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 11: [2022-11-25 17:40:08,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-25 17:40:08,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 15: [2022-11-25 17:40:08,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 15: [2022-11-25 17:40:08,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-25 17:40:08,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 41: [2022-11-25 17:40:08,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 41: [2022-11-25 17:40:08,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 41: [2022-11-25 17:40:08,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 17:40:08,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-25 17:40:08,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-25 17:40:08,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 17:40:08,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-25 17:40:08,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-25 17:40:08,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 43: [2022-11-25 17:40:08,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-25 17:40:08,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-25 17:40:08,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 6: [2022-11-25 17:40:08,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-25 17:40:08,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-25 17:40:08,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 17:40:08,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-25 17:40:08,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 7: [2022-11-25 17:40:08,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 26: [2022-11-25 17:40:08,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 48: [2022-11-25 17:40:08,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 26: [2022-11-25 17:40:08,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-25 17:40:08,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 17:40:08,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-25 17:40:08,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 14: [2022-11-25 17:40:08,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-25 17:40:08,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-25 17:40:08,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 13: [2022-11-25 17:40:08,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-25 17:40:08,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-25 17:40:08,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 17:40:08,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-25 17:40:08,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 27: [2022-11-25 17:40:08,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 56: [2022-11-25 17:40:08,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 27: [2022-11-25 17:40:08,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-25 17:40:08,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 45: [2022-11-25 17:40:08,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 45: [2022-11-25 17:40:08,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 45: [2022-11-25 17:40:08,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 33: [2022-11-25 17:40:08,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-25 17:40:08,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-25 17:40:08,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 17:40:08,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 5: [2022-11-25 17:40:08,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 47: [2022-11-25 17:40:08,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 58: [2022-11-25 17:40:08,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 47: [2022-11-25 17:40:08,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 58: [2022-11-25 17:40:08,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 5: [2022-11-25 17:40:08,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 47: [2022-11-25 17:40:08,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 17:40:08,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-25 17:40:08,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 17:40:08,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 17:40:08,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 63: [2022-11-25 17:40:08,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-25 17:40:08,926] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-25 17:40:08,926] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 17:40:08,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 25: [2022-11-25 17:40:08,927] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-25 17:40:08,927] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 62: [2022-11-25 17:40:08,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-25 17:40:08,927] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-25 17:40:08,927] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 38: [2022-11-25 17:40:08,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-25 17:40:08,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-25 17:40:08,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 18: [2022-11-25 17:40:08,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-25 17:40:08,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-25 17:40:08,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 59: [2022-11-25 17:40:08,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-25 17:40:08,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-25 17:40:08,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 37: [2022-11-25 17:40:08,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-25 17:40:08,929] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-25 17:40:08,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 17:40:08,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 21: [2022-11-25 17:40:08,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 4: [2022-11-25 17:40:08,930] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-25 17:40:08,930] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 21: [2022-11-25 17:40:08,930] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-25 17:40:08,930] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 17:40:08,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-25 17:40:08,930] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-25 17:40:08,930] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 32: [2022-11-25 17:40:08,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 32: [2022-11-25 17:40:08,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-25 17:40:08,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 22: [2022-11-25 17:40:08,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-25 17:40:08,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-25 17:40:08,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 17:40:08,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 14: [2022-11-25 17:40:08,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 59: [2022-11-25 17:40:08,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-25 17:40:08,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 52: [2022-11-25 17:40:08,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-25 17:40:08,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 14: [2022-11-25 17:40:08,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-25 17:40:08,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 59: [2022-11-25 17:40:08,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 17:40:08,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-25 17:40:08,935] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-25 17:40:08,935] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 17:40:08,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-25 17:40:08,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-25 17:40:08,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 34: [2022-11-25 17:40:08,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 34: [2022-11-25 17:40:08,938] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-25 17:40:08,938] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 46: [2022-11-25 17:40:08,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-25 17:40:08,941] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-25 17:40:08,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 31: [2022-11-25 17:40:08,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-25 17:40:08,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 1: [2022-11-25 17:40:08,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 31: [2022-11-25 17:40:08,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 1: [2022-11-25 17:40:08,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-25 17:40:08,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 44: [2022-11-25 17:40:08,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-25 17:40:08,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-25 17:40:08,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 24: [2022-11-25 17:40:08,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 24: [2022-11-25 17:40:08,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-25 17:40:08,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 17:40:08,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-25 17:40:08,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-25 17:40:08,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-25 17:40:08,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-25 17:40:08,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 17:40:08,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 40: [2022-11-25 17:40:08,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-25 17:40:08,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-25 17:40:08,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 15: [2022-11-25 17:40:08,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-25 17:40:08,948] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-25 17:40:08,948] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 49: [2022-11-25 17:40:08,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-25 17:40:08,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-25 17:40:08,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 17: [2022-11-25 17:40:08,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-25 17:40:08,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-25 17:40:08,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 3: [2022-11-25 17:40:08,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 10: [2022-11-25 17:40:08,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 3: [2022-11-25 17:40:08,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 10: [2022-11-25 17:40:08,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-25 17:40:08,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 3: [2022-11-25 17:40:08,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 57: [2022-11-25 17:40:08,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 57: [2022-11-25 17:40:08,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-25 17:40:08,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 55: [2022-11-25 17:40:08,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 55: [2022-11-25 17:40:08,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 55: [2022-11-25 17:40:08,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 33: [2022-11-25 17:40:08,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-25 17:40:08,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-25 17:40:08,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 17:40:08,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-25 17:40:08,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-25 17:40:08,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: [2022-11-25 17:40:08,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-25 17:40:08,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 0: [2022-11-25 17:40:08,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 17:40:08,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 42: [2022-11-25 17:40:08,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-25 17:40:08,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 36: [2022-11-25 17:40:08,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-25 17:40:08,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-25 17:40:08,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 21: [2022-11-25 17:40:08,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-25 17:40:08,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-25 17:40:08,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 8: [2022-11-25 17:40:08,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-25 17:40:08,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-25 17:40:08,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 35: [2022-11-25 17:40:08,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-25 17:40:08,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-25 17:40:08,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 38: [2022-11-25 17:40:08,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 38: [2022-11-25 17:40:08,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-25 17:40:08,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 16: [2022-11-25 17:40:08,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-25 17:40:08,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-25 17:40:08,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 13: [2022-11-25 17:40:08,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 13: [2022-11-25 17:40:08,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-25 17:40:08,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 26: [2022-11-25 17:40:08,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-25 17:40:08,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-25 17:40:08,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 17:40:08,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 63: [2022-11-25 17:40:08,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 9: [2022-11-25 17:40:08,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 63: [2022-11-25 17:40:08,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-25 17:40:08,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 17:40:08,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 17:40:08,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-25 17:40:08,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-25 17:40:08,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 20: [2022-11-25 17:40:08,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-25 17:40:08,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-25 17:40:08,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 17:40:08,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-25 17:40:08,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-25 17:40:08,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 1: [2022-11-25 17:40:08,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-25 17:40:08,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-25 17:40:08,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 2: [2022-11-25 17:40:08,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-25 17:40:08,979] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-25 17:40:08,979] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 24: [2022-11-25 17:40:08,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-25 17:40:08,979] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-25 17:40:08,979] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 28: [2022-11-25 17:40:08,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-25 17:40:08,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-25 17:40:08,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 59: [2022-11-25 17:40:08,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-25 17:40:08,981] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-25 17:40:08,981] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 17:40:08,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-25 17:40:08,981] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-25 17:40:08,981] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 17:40:08,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 29: [2022-11-25 17:40:08,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 52: [2022-11-25 17:40:08,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-25 17:40:08,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 29: [2022-11-25 17:40:08,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-25 17:40:08,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 17:40:08,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 61: [2022-11-25 17:40:08,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 61: [2022-11-25 17:40:08,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 60: [2022-11-25 17:40:08,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-25 17:40:08,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-25 17:40:08,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 50: [2022-11-25 17:40:08,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 50: [2022-11-25 17:40:08,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-25 17:40:08,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 31: [2022-11-25 17:40:08,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 33: [2022-11-25 17:40:08,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 31: [2022-11-25 17:40:08,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-25 17:40:08,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 33: [2022-11-25 17:40:08,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-25 17:40:08,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 11: [2022-11-25 17:40:08,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 11: [2022-11-25 17:40:08,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 11: [2022-11-25 17:40:08,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 54: [2022-11-25 17:40:08,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 54: [2022-11-25 17:40:08,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-25 17:40:08,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 17:40:08,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 62: [2022-11-25 17:40:08,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 23: [2022-11-25 17:40:08,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 62: [2022-11-25 17:40:08,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 23: [2022-11-25 17:40:08,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 17:40:08,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 62: [2022-11-25 17:40:08,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 17:40:08,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 6: [2022-11-25 17:40:08,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 47: [2022-11-25 17:40:08,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 6: [2022-11-25 17:40:08,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-25 17:40:08,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 44: [2022-11-25 17:40:08,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-25 17:40:08,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-25 17:40:08,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 14: [2022-11-25 17:40:08,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-25 17:40:08,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-25 17:40:08,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 17:40:08,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-25 17:40:08,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-25 17:40:08,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 27: [2022-11-25 17:40:08,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 43: [2022-11-25 17:40:08,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 27: [2022-11-25 17:40:08,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 43: [2022-11-25 17:40:08,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 27: [2022-11-25 17:40:08,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 43: [2022-11-25 17:40:08,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 45: [2022-11-25 17:40:08,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 45: [2022-11-25 17:40:08,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 57: [2022-11-25 17:40:08,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 45: [2022-11-25 17:40:08,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 57: [2022-11-25 17:40:08,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 57: [2022-11-25 17:40:08,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 17:40:08,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-25 17:40:08,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-25 17:40:08,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 17:40:08,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-25 17:40:08,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 56: [2022-11-25 17:40:08,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 4: [2022-11-25 17:40:08,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 17:40:08,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-25 17:40:08,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 37: [2022-11-25 17:40:08,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 37: [2022-11-25 17:40:08,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-25 17:40:08,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 40: [2022-11-25 17:40:08,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-25 17:40:08,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 61: [2022-11-25 17:40:08,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 40: [2022-11-25 17:40:08,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 17:40:08,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-25 17:40:08,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 17:40:08,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-25 17:40:08,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-25 17:40:08,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 17:40:08,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 58: [2022-11-25 17:40:08,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-25 17:40:08,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 41: [2022-11-25 17:40:09,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-25 17:40:09,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 41: [2022-11-25 17:40:09,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 17:40:09,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-25 17:40:09,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-25 17:40:09,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 38: [2022-11-25 17:40:09,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-25 17:40:09,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-25 17:40:09,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 50: [2022-11-25 17:40:09,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-25 17:40:09,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-25 17:40:09,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 17:40:09,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-25 17:40:09,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-25 17:40:09,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 8: [2022-11-25 17:40:09,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-25 17:40:09,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-25 17:40:09,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 17:40:09,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 49: [2022-11-25 17:40:09,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-25 17:40:09,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-25 17:40:09,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 22: [2022-11-25 17:40:09,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-25 17:40:09,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-25 17:40:09,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 17:40:09,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-25 17:40:09,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-25 17:40:09,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 10: [2022-11-25 17:40:09,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-25 17:40:09,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-25 17:40:09,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 17:40:09,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-25 17:40:09,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 3: [2022-11-25 17:40:09,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 25: [2022-11-25 17:40:09,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 25: [2022-11-25 17:40:09,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 3: [2022-11-25 17:40:09,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 25: [2022-11-25 17:40:09,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 3: [2022-11-25 17:40:09,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 17:40:09,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 35: [2022-11-25 17:40:09,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 42: [2022-11-25 17:40:09,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 35: [2022-11-25 17:40:09,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 42: [2022-11-25 17:40:09,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 35: [2022-11-25 17:40:09,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 63: [2022-11-25 17:40:09,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 63: [2022-11-25 17:40:09,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-25 17:40:09,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 13: [2022-11-25 17:40:09,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 12: [2022-11-25 17:40:09,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 13: [2022-11-25 17:40:09,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 12: [2022-11-25 17:40:09,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 13: [2022-11-25 17:40:09,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 17:40:09,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 17:40:09,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 46: [2022-11-25 17:40:09,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 52: [2022-11-25 17:40:09,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 46: [2022-11-25 17:40:09,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 52: [2022-11-25 17:40:09,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 46: [2022-11-25 17:40:09,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 1: [2022-11-25 17:40:09,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-25 17:40:09,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-25 17:40:09,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 2: [2022-11-25 17:40:09,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-25 17:40:09,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-25 17:40:09,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 26: [2022-11-25 17:40:09,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 26: [2022-11-25 17:40:09,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-25 17:40:09,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 24: [2022-11-25 17:40:09,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-25 17:40:09,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-25 17:40:09,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 17:40:09,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-25 17:40:09,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-25 17:40:09,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 11: [2022-11-25 17:40:09,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 11: [2022-11-25 17:40:09,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 11: [2022-11-25 17:40:09,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 13: [2022-11-25 17:40:09,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 13: [2022-11-25 17:40:09,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-25 17:40:09,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 41: [2022-11-25 17:40:09,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-25 17:40:09,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-25 17:40:09,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 36: [2022-11-25 17:40:09,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 36: [2022-11-25 17:40:09,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-25 17:40:09,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 34: [2022-11-25 17:40:09,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-25 17:40:09,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-25 17:40:09,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 44: [2022-11-25 17:40:09,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 44: [2022-11-25 17:40:09,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 44: [2022-11-25 17:40:09,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 17:40:09,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-25 17:40:09,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-25 17:40:09,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 6: [2022-11-25 17:40:09,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 6: [2022-11-25 17:40:09,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-25 17:40:09,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 57: [2022-11-25 17:40:09,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 57: [2022-11-25 17:40:09,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-25 17:40:09,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 15: [2022-11-25 17:40:09,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-25 17:40:09,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-25 17:40:09,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 36: [2022-11-25 17:40:09,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-25 17:40:09,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-25 17:40:09,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 31: [2022-11-25 17:40:09,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-25 17:40:09,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-25 17:40:09,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 17:40:09,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-25 17:40:09,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-25 17:40:09,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 54: [2022-11-25 17:40:09,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 54: [2022-11-25 17:40:09,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-25 17:40:09,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 17:40:09,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 45: [2022-11-25 17:40:09,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 5: [2022-11-25 17:40:09,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 18: [2022-11-25 17:40:09,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 45: [2022-11-25 17:40:09,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 18: [2022-11-25 17:40:09,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 5: [2022-11-25 17:40:09,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 18: [2022-11-25 17:40:09,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 45: [2022-11-25 17:40:09,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 55: [2022-11-25 17:40:09,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-25 17:40:09,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-25 17:40:09,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 46: [2022-11-25 17:40:09,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-25 17:40:09,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-25 17:40:09,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 17:40:09,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-25 17:40:09,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-25 17:40:09,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 21: [2022-11-25 17:40:09,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-25 17:40:09,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 21: [2022-11-25 17:40:09,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 17:40:09,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-25 17:40:09,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 62: [2022-11-25 17:40:09,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 19: [2022-11-25 17:40:09,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 27: [2022-11-25 17:40:09,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 62: [2022-11-25 17:40:09,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 62: [2022-11-25 17:40:09,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 27: [2022-11-25 17:40:09,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-25 17:40:09,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 28: [2022-11-25 17:40:09,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-25 17:40:09,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-25 17:40:09,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 18: [2022-11-25 17:40:09,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-25 17:40:09,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-25 17:40:09,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 14: [2022-11-25 17:40:09,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 14: [2022-11-25 17:40:09,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-25 17:40:09,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 22: [2022-11-25 17:40:09,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 22: [2022-11-25 17:40:09,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 22: [2022-11-25 17:40:09,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 21: [2022-11-25 17:40:09,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-25 17:40:09,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-25 17:40:09,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 37: [2022-11-25 17:40:09,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 37: [2022-11-25 17:40:09,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-25 17:40:09,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 17:40:09,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 4: [2022-11-25 17:40:09,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-25 17:40:09,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 59: [2022-11-25 17:40:09,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 59: [2022-11-25 17:40:09,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-25 17:40:09,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 15: [2022-11-25 17:40:09,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-25 17:40:09,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-25 17:40:09,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 17:40:09,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 25: [2022-11-25 17:40:09,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-25 17:40:09,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 38: [2022-11-25 17:40:09,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-25 17:40:09,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-25 17:40:09,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 17:40:09,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 19: [2022-11-25 17:40:09,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-25 17:40:09,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 17:40:09,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-25 17:40:09,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-25 17:40:09,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 17:40:09,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 56: [2022-11-25 17:40:09,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 56: [2022-11-25 17:40:09,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 18: [2022-11-25 17:40:09,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 26: [2022-11-25 17:40:09,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 18: [2022-11-25 17:40:09,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-25 17:40:09,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 26: [2022-11-25 17:40:09,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-25 17:40:09,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 45: [2022-11-25 17:40:09,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 35: [2022-11-25 17:40:09,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 45: [2022-11-25 17:40:09,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-25 17:40:09,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 35: [2022-11-25 17:40:09,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-25 17:40:09,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 41: [2022-11-25 17:40:09,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 41: [2022-11-25 17:40:09,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-25 17:40:09,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 63: [2022-11-25 17:40:09,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 43: [2022-11-25 17:40:09,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 63: [2022-11-25 17:40:09,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-25 17:40:09,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 43: [2022-11-25 17:40:09,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-25 17:40:09,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 22: [2022-11-25 17:40:09,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-25 17:40:09,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 24: [2022-11-25 17:40:09,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 22: [2022-11-25 17:40:09,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 24: [2022-11-25 17:40:09,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 59: [2022-11-25 17:40:09,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 24: [2022-11-25 17:40:09,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 50: [2022-11-25 17:40:09,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 59: [2022-11-25 17:40:09,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 5: [2022-11-25 17:40:09,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 50: [2022-11-25 17:40:09,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 53: [2022-11-25 17:40:09,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 59: [2022-11-25 17:40:09,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 17:40:09,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 1: [2022-11-25 17:40:09,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 50: [2022-11-25 17:40:09,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 17:40:09,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-25 17:40:09,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 5: [2022-11-25 17:40:09,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 17:40:09,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 17:40:09,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 1: [2022-11-25 17:40:09,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 53: [2022-11-25 17:40:09,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 1: [2022-11-25 17:40:09,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 2: [2022-11-25 17:40:09,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 40: [2022-11-25 17:40:09,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 2: [2022-11-25 17:40:09,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-25 17:40:09,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 40: [2022-11-25 17:40:09,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-25 17:40:09,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 6: [2022-11-25 17:40:09,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 55: [2022-11-25 17:40:09,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 6: [2022-11-25 17:40:09,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 55: [2022-11-25 17:40:09,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 6: [2022-11-25 17:40:09,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 55: [2022-11-25 17:40:09,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 17:40:09,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 49: [2022-11-25 17:40:09,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-25 17:40:09,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 4: [2022-11-25 17:40:09,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 34: [2022-11-25 17:40:09,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 49: [2022-11-25 17:40:09,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-25 17:40:09,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 4: [2022-11-25 17:40:09,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 49: [2022-11-25 17:40:09,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 34: [2022-11-25 17:40:09,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 49: [2022-11-25 17:40:09,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 17:40:09,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 34: [2022-11-25 17:40:09,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 17:40:09,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 21: [2022-11-25 17:40:09,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 7: [2022-11-25 17:40:09,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 21: [2022-11-25 17:40:09,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-25 17:40:09,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 62: [2022-11-25 17:40:09,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-25 17:40:09,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-25 17:40:09,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 3: [2022-11-25 17:40:09,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-25 17:40:09,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-25 17:40:09,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 17:40:09,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 19: [2022-11-25 17:40:09,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-25 17:40:09,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 43: [2022-11-25 17:40:09,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-25 17:40:09,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-25 17:40:09,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 46: [2022-11-25 17:40:09,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-25 17:40:09,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 46: [2022-11-25 17:40:09,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 17:40:09,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 61: [2022-11-25 17:40:09,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 47: [2022-11-25 17:40:09,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 61: [2022-11-25 17:40:09,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 47: [2022-11-25 17:40:09,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 17:40:09,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 55: [2022-11-25 17:40:09,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-25 17:40:09,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-25 17:40:09,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 17:40:09,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 25: [2022-11-25 17:40:09,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-25 17:40:09,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 17:40:09,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 42: [2022-11-25 17:40:09,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-25 17:40:09,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 3: [2022-11-25 17:40:09,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 36: [2022-11-25 17:40:09,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 3: [2022-11-25 17:40:09,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 36: [2022-11-25 17:40:09,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 3: [2022-11-25 17:40:09,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 36: [2022-11-25 17:40:09,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 10: [2022-11-25 17:40:09,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 10: [2022-11-25 17:40:09,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 10: [2022-11-25 17:40:09,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 35: [2022-11-25 17:40:09,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-25 17:40:09,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-25 17:40:09,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 17:40:09,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-25 17:40:09,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-25 17:40:09,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 14: [2022-11-25 17:40:09,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 14: [2022-11-25 17:40:09,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-25 17:40:09,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 10: [2022-11-25 17:40:09,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-25 17:40:09,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-25 17:40:09,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 13: [2022-11-25 17:40:09,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-25 17:40:09,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-25 17:40:09,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 17:40:09,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 54: [2022-11-25 17:40:09,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 12: [2022-11-25 17:40:09,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-25 17:40:09,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 54: [2022-11-25 17:40:09,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-25 17:40:09,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 37: [2022-11-25 17:40:09,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-25 17:40:09,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-25 17:40:09,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 31: [2022-11-25 17:40:09,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-25 17:40:09,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-25 17:40:09,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 17:40:09,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-25 17:40:09,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-25 17:40:09,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 28: [2022-11-25 17:40:09,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 28: [2022-11-25 17:40:09,083] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-25 17:40:09,083] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 17:40:09,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 40: [2022-11-25 17:40:09,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 58: [2022-11-25 17:40:09,083] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-25 17:40:09,083] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 40: [2022-11-25 17:40:09,083] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-25 17:40:09,083] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 57: [2022-11-25 17:40:09,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 57: [2022-11-25 17:40:09,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 57: [2022-11-25 17:40:09,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 44: [2022-11-25 17:40:09,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-25 17:40:09,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-25 17:40:09,085] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 27: [2022-11-25 17:40:09,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-25 17:40:09,085] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-25 17:40:09,085] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 34: [2022-11-25 17:40:09,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-25 17:40:09,085] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-25 17:40:09,085] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 18: [2022-11-25 17:40:09,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-25 17:40:09,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-25 17:40:09,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 11: [2022-11-25 17:40:09,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-25 17:40:09,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-25 17:40:09,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 15: [2022-11-25 17:40:09,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-25 17:40:09,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-25 17:40:09,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 59: [2022-11-25 17:40:09,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-25 17:40:09,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-25 17:40:09,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 38: [2022-11-25 17:40:09,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 38: [2022-11-25 17:40:09,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-25 17:40:09,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 8: [2022-11-25 17:40:09,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-25 17:40:09,099] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-25 17:40:09,099] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 17:40:09,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-25 17:40:09,100] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-25 17:40:09,100] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 50: [2022-11-25 17:40:09,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-25 17:40:09,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-25 17:40:09,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 17:40:09,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 52: [2022-11-25 17:40:09,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-25 17:40:09,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 17:40:09,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 56: [2022-11-25 17:40:09,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 56: [2022-11-25 17:40:09,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 63: [2022-11-25 17:40:09,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-25 17:40:09,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step1000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-25 17:40:09,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: successfully saved checkpoint at iteration 1000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5182.38 63: iteration 1010/ 24424 | consumed samples: 517120 | consumed tokens: 1059061760 | elapsed time per iteration (s): 2.86 | learning rate: 1.996E-04 | global batch size: 512 | lm loss: 3.027368E+00 | grad norm: 0.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.240 | TFLOPs: 18.45 | 63: iteration 1020/ 24424 | consumed samples: 522240 | consumed tokens: 1069547520 | elapsed time per iteration (s): 2.26 | learning rate: 1.995E-04 | global batch size: 512 | lm loss: 3.019261E+00 | grad norm: 0.641 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.823 | TFLOPs: 23.35 | 63: iteration 1030/ 24424 | consumed samples: 527360 | consumed tokens: 1080033280 | elapsed time per iteration (s): 2.28 | learning rate: 1.995E-04 | global batch size: 512 | lm loss: 3.436409E+00 | grad norm: 3.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.188 | TFLOPs: 23.08 | 63: iteration 1040/ 24424 | consumed samples: 532480 | consumed tokens: 1090519040 | elapsed time per iteration (s): 2.31 | learning rate: 1.995E-04 | global batch size: 512 | lm loss: 4.303681E+00 | grad norm: 2.707 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.434 | TFLOPs: 22.80 | 63: iteration 1050/ 24424 | consumed samples: 537600 | consumed tokens: 1101004800 | elapsed time per iteration (s): 2.28 | learning rate: 1.995E-04 | global batch size: 512 | lm loss: 4.323696E+00 | grad norm: 1.707 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.533 | TFLOPs: 23.11 | 63: iteration 1060/ 24424 | consumed samples: 542720 | consumed tokens: 1111490560 | elapsed time per iteration (s): 2.30 | learning rate: 1.995E-04 | global batch size: 512 | lm loss: 3.824686E+00 | grad norm: 1.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.066 | TFLOPs: 22.96 | 63: iteration 1070/ 24424 | consumed samples: 547840 | consumed tokens: 1121976320 | elapsed time per iteration (s): 2.24 | learning rate: 1.995E-04 | global batch size: 512 | lm loss: 3.569777E+00 | grad norm: 0.771 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.272 | TFLOPs: 23.50 | 63: iteration 1080/ 24424 | consumed samples: 552960 | consumed tokens: 1132462080 | elapsed time per iteration (s): 2.25 | learning rate: 1.995E-04 | global batch size: 512 | lm loss: 3.320217E+00 | grad norm: 0.589 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.864 | TFLOPs: 23.46 | 63: iteration 1090/ 24424 | consumed samples: 558080 | consumed tokens: 1142947840 | elapsed time per iteration (s): 2.23 | learning rate: 1.995E-04 | global batch size: 512 | lm loss: 3.222166E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.579 | TFLOPs: 23.63 | 63: iteration 1100/ 24424 | consumed samples: 563200 | consumed tokens: 1153433600 | elapsed time per iteration (s): 2.23 | learning rate: 1.994E-04 | global batch size: 512 | lm loss: 3.142260E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.879 | TFLOPs: 23.66 | 63: iteration 1110/ 24424 | consumed samples: 568320 | consumed tokens: 1163919360 | elapsed time per iteration (s): 2.26 | learning rate: 1.994E-04 | global batch size: 512 | lm loss: 3.135940E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.253 | TFLOPs: 23.29 | 63: iteration 1120/ 24424 | consumed samples: 573440 | consumed tokens: 1174405120 | elapsed time per iteration (s): 2.25 | learning rate: 1.994E-04 | global batch size: 512 | lm loss: 3.082256E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.270 | TFLOPs: 23.40 | 63: iteration 1130/ 24424 | consumed samples: 578560 | consumed tokens: 1184890880 | elapsed time per iteration (s): 2.25 | learning rate: 1.994E-04 | global batch size: 512 | lm loss: 3.019249E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.884 | TFLOPs: 23.46 | 63: iteration 1140/ 24424 | consumed samples: 583680 | consumed tokens: 1195376640 | elapsed time per iteration (s): 2.25 | learning rate: 1.994E-04 | global batch size: 512 | lm loss: 2.992118E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.408 | TFLOPs: 23.41 | 63: iteration 1150/ 24424 | consumed samples: 588800 | consumed tokens: 1205862400 | elapsed time per iteration (s): 2.23 | learning rate: 1.994E-04 | global batch size: 512 | lm loss: 3.006853E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.754 | TFLOPs: 23.65 | 63: iteration 1160/ 24424 | consumed samples: 593920 | consumed tokens: 1216348160 | elapsed time per iteration (s): 2.23 | learning rate: 1.994E-04 | global batch size: 512 | lm loss: 2.987839E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.797 | TFLOPs: 23.66 | 63: iteration 1170/ 24424 | consumed samples: 599040 | consumed tokens: 1226833920 | elapsed time per iteration (s): 2.23 | learning rate: 1.993E-04 | global batch size: 512 | lm loss: 2.915365E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.745 | TFLOPs: 23.65 | 63: iteration 1180/ 24424 | consumed samples: 604160 | consumed tokens: 1237319680 | elapsed time per iteration (s): 2.27 | learning rate: 1.993E-04 | global batch size: 512 | lm loss: 2.958263E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.793 | TFLOPs: 23.24 | 63: iteration 1190/ 24424 | consumed samples: 609280 | consumed tokens: 1247805440 | elapsed time per iteration (s): 2.24 | learning rate: 1.993E-04 | global batch size: 512 | lm loss: 2.946287E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.965 | TFLOPs: 23.57 | 63: iteration 1200/ 24424 | consumed samples: 614400 | consumed tokens: 1258291200 | elapsed time per iteration (s): 2.25 | learning rate: 1.993E-04 | global batch size: 512 | lm loss: 2.934964E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.026 | TFLOPs: 23.47 | 63: iteration 1210/ 24424 | consumed samples: 619520 | consumed tokens: 1268776960 | elapsed time per iteration (s): 2.24 | learning rate: 1.993E-04 | global batch size: 512 | lm loss: 2.909301E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.073 | TFLOPs: 23.58 | 63: iteration 1220/ 24424 | consumed samples: 624640 | consumed tokens: 1279262720 | elapsed time per iteration (s): 2.26 | learning rate: 1.993E-04 | global batch size: 512 | lm loss: 2.890003E+00 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.165 | TFLOPs: 23.28 | 63: iteration 1230/ 24424 | consumed samples: 629760 | consumed tokens: 1289748480 | elapsed time per iteration (s): 2.23 | learning rate: 1.993E-04 | global batch size: 512 | lm loss: 2.904157E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.261 | TFLOPs: 23.60 | 63: iteration 1240/ 24424 | consumed samples: 634880 | consumed tokens: 1300234240 | elapsed time per iteration (s): 2.23 | learning rate: 1.992E-04 | global batch size: 512 | lm loss: 2.870628E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.676 | TFLOPs: 23.64 | 63: iteration 1250/ 24424 | consumed samples: 640000 | consumed tokens: 1310720000 | elapsed time per iteration (s): 2.26 | learning rate: 1.992E-04 | global batch size: 512 | lm loss: 2.865262E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.958 | TFLOPs: 23.36 | 63: iteration 1260/ 24424 | consumed samples: 645120 | consumed tokens: 1321205760 | elapsed time per iteration (s): 2.25 | learning rate: 1.992E-04 | global batch size: 512 | lm loss: 2.868543E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.710 | TFLOPs: 23.44 | 63: iteration 1270/ 24424 | consumed samples: 650240 | consumed tokens: 1331691520 | elapsed time per iteration (s): 2.23 | learning rate: 1.992E-04 | global batch size: 512 | lm loss: 2.848913E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.255 | TFLOPs: 23.60 | 63: iteration 1280/ 24424 | consumed samples: 655360 | consumed tokens: 1342177280 | elapsed time per iteration (s): 2.23 | learning rate: 1.992E-04 | global batch size: 512 | lm loss: 2.821249E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.526 | TFLOPs: 23.63 | 63: iteration 1290/ 24424 | consumed samples: 660480 | consumed tokens: 1352663040 | elapsed time per iteration (s): 2.25 | learning rate: 1.992E-04 | global batch size: 512 | lm loss: 2.839581E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.379 | TFLOPs: 23.41 | 63: iteration 1300/ 24424 | consumed samples: 665600 | consumed tokens: 1363148800 | elapsed time per iteration (s): 2.25 | learning rate: 1.992E-04 | global batch size: 512 | lm loss: 2.835756E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.992 | TFLOPs: 23.47 | 63: iteration 1310/ 24424 | consumed samples: 670720 | consumed tokens: 1373634560 | elapsed time per iteration (s): 2.23 | learning rate: 1.991E-04 | global batch size: 512 | lm loss: 2.847941E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.015 | TFLOPs: 23.68 | 63: iteration 1320/ 24424 | consumed samples: 675840 | consumed tokens: 1384120320 | elapsed time per iteration (s): 2.25 | learning rate: 1.991E-04 | global batch size: 512 | lm loss: 2.847007E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.669 | TFLOPs: 23.44 | 63: iteration 1330/ 24424 | consumed samples: 680960 | consumed tokens: 1394606080 | elapsed time per iteration (s): 2.25 | learning rate: 1.991E-04 | global batch size: 512 | lm loss: 2.820989E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.419 | TFLOPs: 23.41 | 63: iteration 1340/ 24424 | consumed samples: 686080 | consumed tokens: 1405091840 | elapsed time per iteration (s): 2.23 | learning rate: 1.991E-04 | global batch size: 512 | lm loss: 2.797473E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.713 | TFLOPs: 23.65 | 63: iteration 1350/ 24424 | consumed samples: 691200 | consumed tokens: 1415577600 | elapsed time per iteration (s): 2.26 | learning rate: 1.991E-04 | global batch size: 512 | lm loss: 2.807513E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.025 | TFLOPs: 23.37 | 63: iteration 1360/ 24424 | consumed samples: 696320 | consumed tokens: 1426063360 | elapsed time per iteration (s): 2.26 | learning rate: 1.991E-04 | global batch size: 512 | lm loss: 2.799786E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.625 | TFLOPs: 23.33 | 63: iteration 1370/ 24424 | consumed samples: 701440 | consumed tokens: 1436549120 | elapsed time per iteration (s): 2.26 | learning rate: 1.990E-04 | global batch size: 512 | lm loss: 2.773254E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.961 | TFLOPs: 23.36 | 63: iteration 1380/ 24424 | consumed samples: 706560 | consumed tokens: 1447034880 | elapsed time per iteration (s): 2.27 | learning rate: 1.990E-04 | global batch size: 512 | lm loss: 2.774832E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.833 | TFLOPs: 23.25 | 63: iteration 1390/ 24424 | consumed samples: 711680 | consumed tokens: 1457520640 | elapsed time per iteration (s): 2.27 | learning rate: 1.990E-04 | global batch size: 512 | lm loss: 2.798199E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.909 | TFLOPs: 23.26 | 63: iteration 1400/ 24424 | consumed samples: 716800 | consumed tokens: 1468006400 | elapsed time per iteration (s): 2.24 | learning rate: 1.990E-04 | global batch size: 512 | lm loss: 2.814790E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.836 | TFLOPs: 23.56 | 63: iteration 1410/ 24424 | consumed samples: 721920 | consumed tokens: 1478492160 | elapsed time per iteration (s): 2.23 | learning rate: 1.990E-04 | global batch size: 512 | lm loss: 2.780122E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.407 | TFLOPs: 23.62 | 63: iteration 1420/ 24424 | consumed samples: 727040 | consumed tokens: 1488977920 | elapsed time per iteration (s): 2.26 | learning rate: 1.990E-04 | global batch size: 512 | lm loss: 2.784196E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.800 | TFLOPs: 23.35 | 63: iteration 1430/ 24424 | consumed samples: 732160 | consumed tokens: 1499463680 | elapsed time per iteration (s): 2.23 | learning rate: 1.989E-04 | global batch size: 512 | lm loss: 2.778684E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.439 | TFLOPs: 23.62 | 63: iteration 1440/ 24424 | consumed samples: 737280 | consumed tokens: 1509949440 | elapsed time per iteration (s): 2.25 | learning rate: 1.989E-04 | global batch size: 512 | lm loss: 2.778806E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.039 | TFLOPs: 23.48 | 63: iteration 1450/ 24424 | consumed samples: 742400 | consumed tokens: 1520435200 | elapsed time per iteration (s): 2.24 | learning rate: 1.989E-04 | global batch size: 512 | lm loss: 2.755775E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.129 | TFLOPs: 23.48 | 63: iteration 1460/ 24424 | consumed samples: 747520 | consumed tokens: 1530920960 | elapsed time per iteration (s): 2.26 | learning rate: 1.989E-04 | global batch size: 512 | lm loss: 2.743578E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.584 | TFLOPs: 23.33 | 63: iteration 1470/ 24424 | consumed samples: 752640 | consumed tokens: 1541406720 | elapsed time per iteration (s): 2.25 | learning rate: 1.989E-04 | global batch size: 512 | lm loss: 2.722883E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.429 | TFLOPs: 23.41 | 63: iteration 1480/ 24424 | consumed samples: 757760 | consumed tokens: 1551892480 | elapsed time per iteration (s): 2.23 | learning rate: 1.988E-04 | global batch size: 512 | lm loss: 2.746241E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.792 | TFLOPs: 23.66 | 63: iteration 1490/ 24424 | consumed samples: 762880 | consumed tokens: 1562378240 | elapsed time per iteration (s): 2.23 | learning rate: 1.988E-04 | global batch size: 512 | lm loss: 2.763607E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.105 | TFLOPs: 23.59 | 63: iteration 1500/ 24424 | consumed samples: 768000 | consumed tokens: 1572864000 | elapsed time per iteration (s): 2.23 | learning rate: 1.988E-04 | global batch size: 512 | lm loss: 2.741699E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.977 | TFLOPs: 23.67 | 63: iteration 1510/ 24424 | consumed samples: 773120 | consumed tokens: 1583349760 | elapsed time per iteration (s): 2.23 | learning rate: 1.988E-04 | global batch size: 512 | lm loss: 2.730465E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.513 | TFLOPs: 23.63 | 63: iteration 1520/ 24424 | consumed samples: 778240 | consumed tokens: 1593835520 | elapsed time per iteration (s): 2.23 | learning rate: 1.988E-04 | global batch size: 512 | lm loss: 2.713260E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.742 | TFLOPs: 23.65 | 63: iteration 1530/ 24424 | consumed samples: 783360 | consumed tokens: 1604321280 | elapsed time per iteration (s): 2.24 | learning rate: 1.987E-04 | global batch size: 512 | lm loss: 2.745312E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.804 | TFLOPs: 23.55 | 63: iteration 1540/ 24424 | consumed samples: 788480 | consumed tokens: 1614807040 | elapsed time per iteration (s): 2.23 | learning rate: 1.987E-04 | global batch size: 512 | lm loss: 2.738276E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.504 | TFLOPs: 23.63 | 63: iteration 1550/ 24424 | consumed samples: 793600 | consumed tokens: 1625292800 | elapsed time per iteration (s): 2.25 | learning rate: 1.987E-04 | global batch size: 512 | lm loss: 2.736626E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.206 | TFLOPs: 23.39 | 63: iteration 1560/ 24424 | consumed samples: 798720 | consumed tokens: 1635778560 | elapsed time per iteration (s): 2.25 | learning rate: 1.987E-04 | global batch size: 512 | lm loss: 2.705065E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.949 | TFLOPs: 23.47 | 63: iteration 1570/ 24424 | consumed samples: 803840 | consumed tokens: 1646264320 | elapsed time per iteration (s): 2.26 | learning rate: 1.987E-04 | global batch size: 512 | lm loss: 2.703664E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.365 | TFLOPs: 23.30 | 63: iteration 1580/ 24424 | consumed samples: 808960 | consumed tokens: 1656750080 | elapsed time per iteration (s): 2.23 | learning rate: 1.986E-04 | global batch size: 512 | lm loss: 2.705077E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.889 | TFLOPs: 23.67 | 63: iteration 1590/ 24424 | consumed samples: 814080 | consumed tokens: 1667235840 | elapsed time per iteration (s): 2.24 | learning rate: 1.986E-04 | global batch size: 512 | lm loss: 2.705332E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.582 | TFLOPs: 23.53 | 63: iteration 1600/ 24424 | consumed samples: 819200 | consumed tokens: 1677721600 | elapsed time per iteration (s): 2.27 | learning rate: 1.986E-04 | global batch size: 512 | lm loss: 2.670523E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.662 | TFLOPs: 23.23 | 63: iteration 1610/ 24424 | consumed samples: 824320 | consumed tokens: 1688207360 | elapsed time per iteration (s): 2.23 | learning rate: 1.986E-04 | global batch size: 512 | lm loss: 2.700950E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.304 | TFLOPs: 23.61 | 63: iteration 1620/ 24424 | consumed samples: 829440 | consumed tokens: 1698693120 | elapsed time per iteration (s): 2.24 | learning rate: 1.986E-04 | global batch size: 512 | lm loss: 2.692284E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.450 | TFLOPs: 23.52 | 63: iteration 1630/ 24424 | consumed samples: 834560 | consumed tokens: 1709178880 | elapsed time per iteration (s): 2.27 | learning rate: 1.985E-04 | global batch size: 512 | lm loss: 3.066188E+00 | grad norm: 6.946 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.246 | TFLOPs: 23.19 | 63: iteration 1640/ 24424 | consumed samples: 839680 | consumed tokens: 1719664640 | elapsed time per iteration (s): 2.27 | learning rate: 1.985E-04 | global batch size: 512 | lm loss: 3.580364E+00 | grad norm: 2.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.784 | TFLOPs: 23.24 | 63: iteration 1650/ 24424 | consumed samples: 844800 | consumed tokens: 1730150400 | elapsed time per iteration (s): 2.27 | learning rate: 1.985E-04 | global batch size: 512 | lm loss: 3.587982E+00 | grad norm: 2.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.799 | TFLOPs: 23.24 | 63: iteration 1660/ 24424 | consumed samples: 849920 | consumed tokens: 1740636160 | elapsed time per iteration (s): 2.25 | learning rate: 1.985E-04 | global batch size: 512 | lm loss: 3.447050E+00 | grad norm: 1.040 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.000 | TFLOPs: 23.47 | 63: iteration 1670/ 24424 | consumed samples: 855040 | consumed tokens: 1751121920 | elapsed time per iteration (s): 2.28 | learning rate: 1.985E-04 | global batch size: 512 | lm loss: 3.227366E+00 | grad norm: 0.912 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.926 | TFLOPs: 23.16 | 63: iteration 1680/ 24424 | consumed samples: 860160 | consumed tokens: 1761607680 | elapsed time per iteration (s): 2.33 | learning rate: 1.984E-04 | global batch size: 512 | lm loss: 3.008277E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.155 | TFLOPs: 22.66 | 63: iteration 1690/ 24424 | consumed samples: 865280 | consumed tokens: 1772093440 | elapsed time per iteration (s): 2.30 | learning rate: 1.984E-04 | global batch size: 512 | lm loss: 2.897125E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.932 | TFLOPs: 22.95 | 63: iteration 1700/ 24424 | consumed samples: 870400 | consumed tokens: 1782579200 | elapsed time per iteration (s): 2.23 | learning rate: 1.984E-04 | global batch size: 512 | lm loss: 2.810868E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.526 | TFLOPs: 23.63 | 63: iteration 1710/ 24424 | consumed samples: 875520 | consumed tokens: 1793064960 | elapsed time per iteration (s): 2.26 | learning rate: 1.984E-04 | global batch size: 512 | lm loss: 2.814397E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.262 | TFLOPs: 23.29 | 63: iteration 1720/ 24424 | consumed samples: 880640 | consumed tokens: 1803550720 | elapsed time per iteration (s): 2.25 | learning rate: 1.984E-04 | global batch size: 512 | lm loss: 2.727725E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.839 | TFLOPs: 23.45 | 63: iteration 1730/ 24424 | consumed samples: 885760 | consumed tokens: 1814036480 | elapsed time per iteration (s): 2.27 | learning rate: 1.983E-04 | global batch size: 512 | lm loss: 2.752745E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.541 | TFLOPs: 23.22 | 63: iteration 1740/ 24424 | consumed samples: 890880 | consumed tokens: 1824522240 | elapsed time per iteration (s): 2.24 | learning rate: 1.983E-04 | global batch size: 512 | lm loss: 2.749708E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.502 | TFLOPs: 23.52 | 63: iteration 1750/ 24424 | consumed samples: 896000 | consumed tokens: 1835008000 | elapsed time per iteration (s): 2.26 | learning rate: 1.983E-04 | global batch size: 512 | lm loss: 2.705905E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.124 | TFLOPs: 23.28 | 63: iteration 1760/ 24424 | consumed samples: 901120 | consumed tokens: 1845493760 | elapsed time per iteration (s): 2.23 | learning rate: 1.983E-04 | global batch size: 512 | lm loss: 2.702267E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.226 | TFLOPs: 23.60 | 63: iteration 1770/ 24424 | consumed samples: 906240 | consumed tokens: 1855979520 | elapsed time per iteration (s): 2.25 | learning rate: 1.982E-04 | global batch size: 512 | lm loss: 2.707509E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.066 | TFLOPs: 23.38 | 63: iteration 1780/ 24424 | consumed samples: 911360 | consumed tokens: 1866465280 | elapsed time per iteration (s): 2.23 | learning rate: 1.982E-04 | global batch size: 512 | lm loss: 2.700995E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.185 | TFLOPs: 23.59 | 63: iteration 1790/ 24424 | consumed samples: 916480 | consumed tokens: 1876951040 | elapsed time per iteration (s): 2.23 | learning rate: 1.982E-04 | global batch size: 512 | lm loss: 2.704451E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.635 | TFLOPs: 23.64 | 63: iteration 1800/ 24424 | consumed samples: 921600 | consumed tokens: 1887436800 | elapsed time per iteration (s): 2.23 | learning rate: 1.982E-04 | global batch size: 512 | lm loss: 2.670743E+00 | grad norm: 0.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.778 | TFLOPs: 23.65 | 63: iteration 1810/ 24424 | consumed samples: 926720 | consumed tokens: 1897922560 | elapsed time per iteration (s): 2.23 | learning rate: 1.981E-04 | global batch size: 512 | lm loss: 2.681067E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.741 | TFLOPs: 23.65 | 63: iteration 1820/ 24424 | consumed samples: 931840 | consumed tokens: 1908408320 | elapsed time per iteration (s): 2.30 | learning rate: 1.981E-04 | global batch size: 512 | lm loss: 2.697255E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.396 | TFLOPs: 22.89 | 63: iteration 1830/ 24424 | consumed samples: 936960 | consumed tokens: 1918894080 | elapsed time per iteration (s): 2.25 | learning rate: 1.981E-04 | global batch size: 512 | lm loss: 2.660093E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.682 | TFLOPs: 23.44 | 63: iteration 1840/ 24424 | consumed samples: 942080 | consumed tokens: 1929379840 | elapsed time per iteration (s): 2.25 | learning rate: 1.981E-04 | global batch size: 512 | lm loss: 2.673641E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.842 | TFLOPs: 23.46 | 63: iteration 1850/ 24424 | consumed samples: 947200 | consumed tokens: 1939865600 | elapsed time per iteration (s): 2.23 | learning rate: 1.980E-04 | global batch size: 512 | lm loss: 2.674889E+00 | grad norm: 0.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.810 | TFLOPs: 23.66 | 63: iteration 1860/ 24424 | consumed samples: 952320 | consumed tokens: 1950351360 | elapsed time per iteration (s): 2.23 | learning rate: 1.980E-04 | global batch size: 512 | lm loss: 2.673508E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.331 | TFLOPs: 23.61 | 63: iteration 1870/ 24424 | consumed samples: 957440 | consumed tokens: 1960837120 | elapsed time per iteration (s): 2.26 | learning rate: 1.980E-04 | global batch size: 512 | lm loss: 2.658292E+00 | grad norm: 0.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.115 | TFLOPs: 23.28 | 63: iteration 1880/ 24424 | consumed samples: 962560 | consumed tokens: 1971322880 | elapsed time per iteration (s): 2.24 | learning rate: 1.980E-04 | global batch size: 512 | lm loss: 2.654223E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.399 | TFLOPs: 23.51 | 63: iteration 1890/ 24424 | consumed samples: 967680 | consumed tokens: 1981808640 | elapsed time per iteration (s): 2.24 | learning rate: 1.980E-04 | global batch size: 512 | lm loss: 2.653256E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.332 | TFLOPs: 23.51 | 63: iteration 1900/ 24424 | consumed samples: 972800 | consumed tokens: 1992294400 | elapsed time per iteration (s): 2.24 | learning rate: 1.979E-04 | global batch size: 512 | lm loss: 2.659357E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.611 | TFLOPs: 23.53 | 63: iteration 1910/ 24424 | consumed samples: 977920 | consumed tokens: 2002780160 | elapsed time per iteration (s): 2.25 | learning rate: 1.979E-04 | global batch size: 512 | lm loss: 2.653588E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.480 | TFLOPs: 23.42 | 63: iteration 1920/ 24424 | consumed samples: 983040 | consumed tokens: 2013265920 | elapsed time per iteration (s): 2.24 | learning rate: 1.979E-04 | global batch size: 512 | lm loss: 2.657680E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.794 | TFLOPs: 23.55 | 63: iteration 1930/ 24424 | consumed samples: 988160 | consumed tokens: 2023751680 | elapsed time per iteration (s): 2.23 | learning rate: 1.979E-04 | global batch size: 512 | lm loss: 2.623164E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.193 | TFLOPs: 23.59 | 63: iteration 1940/ 24424 | consumed samples: 993280 | consumed tokens: 2034237440 | elapsed time per iteration (s): 2.23 | learning rate: 1.978E-04 | global batch size: 512 | lm loss: 2.627484E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.870 | TFLOPs: 23.66 | 63: iteration 1950/ 24424 | consumed samples: 998400 | consumed tokens: 2044723200 | elapsed time per iteration (s): 2.24 | learning rate: 1.978E-04 | global batch size: 512 | lm loss: 2.658267E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.192 | TFLOPs: 23.49 | 63: iteration 1960/ 24424 | consumed samples: 1003520 | consumed tokens: 2055208960 | elapsed time per iteration (s): 2.24 | learning rate: 1.978E-04 | global batch size: 512 | lm loss: 2.645794E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.348 | TFLOPs: 23.51 | 63: iteration 1970/ 24424 | consumed samples: 1008640 | consumed tokens: 2065694720 | elapsed time per iteration (s): 2.24 | learning rate: 1.977E-04 | global batch size: 512 | lm loss: 2.671682E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.471 | TFLOPs: 23.52 | 63: iteration 1980/ 24424 | consumed samples: 1013760 | consumed tokens: 2076180480 | elapsed time per iteration (s): 2.23 | learning rate: 1.977E-04 | global batch size: 512 | lm loss: 2.628388E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.679 | TFLOPs: 23.64 | 63: iteration 1990/ 24424 | consumed samples: 1018880 | consumed tokens: 2086666240 | elapsed time per iteration (s): 2.26 | learning rate: 1.977E-04 | global batch size: 512 | lm loss: 2.664319E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.001 | TFLOPs: 23.37 | 0: [2022-11-25 18:17:37,824] [INFO] [logging.py:68:log_dist] [Rank 0] step=2000, skipped=0, lr=[0.00019766856618488637, 0.00019766856618488637, 0.00019766856618488637], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] 63: iteration 2000/ 24424 | consumed samples: 1024000 | consumed tokens: 2097152000 | elapsed time per iteration (s): 2.25 | learning rate: 1.977E-04 | global batch size: 512 | lm loss: 2.634846E+00 | grad norm: 0.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.220 | TFLOPs: 23.39 | 0: steps: 2000 loss: 2.6490 iter time (s): 2.265 samples/sec: 226.014 63: ------------------------------------------------------------------------------------------ 63: valid loss at iteration 2000 | lm loss value: 2.643706E+00 | lm loss PPL: 1.406523E+01 | 63: ------------------------------------------------------------------------------------------ 0: saving checkpoint at iteration 2000 to checkpoints_3b9 0: [2022-11-25 18:17:38,584] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2000 is begin to save! 32: [2022-11-25 18:17:38,607] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_21-model_00-model_states.pt... 0: [2022-11-25 18:17:38,643] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_01-model_00-model_states.pt... 32: [2022-11-25 18:17:38,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_21-model_00-model_states.pt. 32: [2022-11-25 18:17:38,933] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_22-model_00-model_states.pt... 0: [2022-11-25 18:17:39,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_01-model_00-model_states.pt. 0: [2022-11-25 18:17:39,156] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_03-model_00-model_states.pt... 32: [2022-11-25 18:17:39,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_22-model_00-model_states.pt. 32: [2022-11-25 18:17:39,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_23-model_00-model_states.pt... 0: [2022-11-25 18:17:39,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_03-model_00-model_states.pt. 0: [2022-11-25 18:17:39,393] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_04-model_00-model_states.pt... 32: [2022-11-25 18:17:39,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_23-model_00-model_states.pt. 32: [2022-11-25 18:17:39,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_24-model_00-model_states.pt... 0: [2022-11-25 18:17:39,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_04-model_00-model_states.pt. 0: [2022-11-25 18:17:39,630] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_05-model_00-model_states.pt... 32: [2022-11-25 18:17:39,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_24-model_00-model_states.pt. 32: [2022-11-25 18:17:39,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_25-model_00-model_states.pt... 0: [2022-11-25 18:17:39,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_05-model_00-model_states.pt. 0: [2022-11-25 18:17:39,865] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_06-model_00-model_states.pt... 32: [2022-11-25 18:17:39,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_25-model_00-model_states.pt. 32: [2022-11-25 18:17:39,967] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_26-model_00-model_states.pt... 0: [2022-11-25 18:17:40,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_06-model_00-model_states.pt. 0: [2022-11-25 18:17:40,100] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_07-model_00-model_states.pt... 32: [2022-11-25 18:17:40,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_26-model_00-model_states.pt. 32: [2022-11-25 18:17:40,199] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_27-model_00-model_states.pt... 0: [2022-11-25 18:17:40,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_07-model_00-model_states.pt. 0: [2022-11-25 18:17:40,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_08-model_00-model_states.pt... 32: [2022-11-25 18:17:40,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_27-model_00-model_states.pt. 32: [2022-11-25 18:17:40,446] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_28-model_00-model_states.pt... 0: [2022-11-25 18:17:40,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_08-model_00-model_states.pt. 0: [2022-11-25 18:17:40,554] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_09-model_00-model_states.pt... 32: [2022-11-25 18:17:40,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_28-model_00-model_states.pt. 32: [2022-11-25 18:17:40,675] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_29-model_00-model_states.pt... 0: [2022-11-25 18:17:40,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_09-model_00-model_states.pt. 0: [2022-11-25 18:17:40,772] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_10-model_00-model_states.pt... 32: [2022-11-25 18:17:40,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_29-model_00-model_states.pt. 32: [2022-11-25 18:17:40,899] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_30-model_00-model_states.pt... 0: [2022-11-25 18:17:40,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_10-model_00-model_states.pt. 0: [2022-11-25 18:17:40,990] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_11-model_00-model_states.pt... 32: [2022-11-25 18:17:41,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_30-model_00-model_states.pt. 32: [2022-11-25 18:17:41,131] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_31-model_00-model_states.pt... 0: [2022-11-25 18:17:41,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_11-model_00-model_states.pt. 0: [2022-11-25 18:17:41,209] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_12-model_00-model_states.pt... 32: [2022-11-25 18:17:41,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_31-model_00-model_states.pt. 32: [2022-11-25 18:17:41,354] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_32-model_00-model_states.pt... 0: [2022-11-25 18:17:41,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_12-model_00-model_states.pt. 0: [2022-11-25 18:17:41,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_13-model_00-model_states.pt... 32: [2022-11-25 18:17:41,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_32-model_00-model_states.pt. 32: [2022-11-25 18:17:41,579] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_33-model_00-model_states.pt... 0: [2022-11-25 18:17:41,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_13-model_00-model_states.pt. 0: [2022-11-25 18:17:41,649] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_14-model_00-model_states.pt... 32: [2022-11-25 18:17:41,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_33-model_00-model_states.pt. 32: [2022-11-25 18:17:41,803] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_34-model_00-model_states.pt... 0: [2022-11-25 18:17:41,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_14-model_00-model_states.pt. 0: [2022-11-25 18:17:41,862] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_15-model_00-model_states.pt... 32: [2022-11-25 18:17:42,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_34-model_00-model_states.pt. 32: [2022-11-25 18:17:42,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_35-model_00-model_states.pt... 0: [2022-11-25 18:17:42,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_15-model_00-model_states.pt. 0: [2022-11-25 18:17:42,079] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_16-model_00-model_states.pt... 32: [2022-11-25 18:17:42,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_35-model_00-model_states.pt. 32: [2022-11-25 18:17:42,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_36-model_00-model_states.pt... 0: [2022-11-25 18:17:42,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_16-model_00-model_states.pt. 0: [2022-11-25 18:17:42,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_17-model_00-model_states.pt... 32: [2022-11-25 18:17:42,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_36-model_00-model_states.pt. 32: [2022-11-25 18:17:42,478] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_37-model_00-model_states.pt... 0: [2022-11-25 18:17:42,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_17-model_00-model_states.pt. 0: [2022-11-25 18:17:42,511] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_18-model_00-model_states.pt... 32: [2022-11-25 18:17:42,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_37-model_00-model_states.pt. 32: [2022-11-25 18:17:42,701] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_38-model_00-model_states.pt... 0: [2022-11-25 18:17:42,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_18-model_00-model_states.pt. 0: [2022-11-25 18:17:42,729] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_19-model_00-model_states.pt... 32: [2022-11-25 18:17:42,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_38-model_00-model_states.pt. 32: [2022-11-25 18:17:42,919] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_40-model_00-model_states.pt... 32: [2022-11-25 18:17:42,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_40-model_00-model_states.pt. 32: [2022-11-25 18:17:42,921] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/mp_rank_01_model_states.pt... 32: [2022-11-25 18:17:42,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/mp_rank_01_model_states.pt. 0: [2022-11-25 18:17:42,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_19-model_00-model_states.pt. 0: [2022-11-25 18:17:42,945] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/layer_20-model_00-model_states.pt... 0: [2022-11-25 18:17:43,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/layer_20-model_00-model_states.pt. 0: [2022-11-25 18:17:43,166] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step2000/mp_rank_00_model_states.pt 0: [2022-11-25 18:17:43,166] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/mp_rank_00_model_states.pt... 0: [2022-11-25 18:17:43,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/mp_rank_00_model_states.pt. 51: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 51: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 55: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 55: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 59: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 59: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 63: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 63: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 63: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 19: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 19: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 19: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 19: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 19: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 19: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 52: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 54: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 58: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 58: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 62: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 62: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 62: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 62: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 16: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 16: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 16: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 16: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 16: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 16: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 60: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 60: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 2: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 2: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 2: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 2: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 2: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 2: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 33: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 33: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 33: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 1: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 1: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 1: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 1: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 1: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 1: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 35: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 35: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 35: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 22: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 34: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 34: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 40: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 40: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 40: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 44: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 44: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 44: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 44: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 6: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 6: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 6: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 6: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 6: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 6: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 32: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 32: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 32: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 42: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 42: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 42: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 46: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 46: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 46: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 46: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 36: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 36: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 41: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 41: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 41: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 45: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 45: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 45: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 45: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 38: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 38: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 38: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 51: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 55: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 59: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 59: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 63: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 63: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 19: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 19: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 52: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 54: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 54: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 54: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 54: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 58: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 62: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 16: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 16: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 60: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 2: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 2: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 33: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 33: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 1: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 1: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 35: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 22: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 34: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 40: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 40: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 44: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 44: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 6: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 6: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 32: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 32: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 42: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 42: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 46: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 36: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 41: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 45: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 38: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 55: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 21: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 52: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 58: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 60: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 0: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 0: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 34: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 32: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 32: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 46: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 58: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 60: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 0: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 0: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 32: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 60: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 0: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 0: [2022-11-25 18:17:43,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 61: [2022-11-25 18:17:43,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 61: [2022-11-25 18:17:43,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-25 18:17:43,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 18:17:43,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-25 18:17:43,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-25 18:17:43,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 18:17:43,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 57: [2022-11-25 18:17:43,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-25 18:17:43,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 32: [2022-11-25 18:17:43,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-25 18:17:43,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-25 18:17:43,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 49: [2022-11-25 18:17:43,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:17:43,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-25 18:17:43,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 18:17:43,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:17:43,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 3: [2022-11-25 18:17:43,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 18:17:43,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:17:43,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:17:43,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 39: [2022-11-25 18:17:43,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 5: [2022-11-25 18:17:43,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 18:17:43,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-25 18:17:43,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 43: [2022-11-25 18:17:43,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 9: [2022-11-25 18:17:43,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 43: [2022-11-25 18:17:43,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-25 18:17:43,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 9: [2022-11-25 18:17:43,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-25 18:17:43,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 28: [2022-11-25 18:17:43,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:17:43,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-25 18:17:43,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 49: [2022-11-25 18:17:43,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 45: [2022-11-25 18:17:43,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:17:43,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 45: [2022-11-25 18:17:43,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 13: [2022-11-25 18:17:43,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 49: [2022-11-25 18:17:43,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 45: [2022-11-25 18:17:43,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 13: [2022-11-25 18:17:43,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-25 18:17:43,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 50: [2022-11-25 18:17:43,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 50: [2022-11-25 18:17:43,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-25 18:17:43,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 26: [2022-11-25 18:17:43,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:17:43,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-25 18:17:43,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 18:17:43,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:17:43,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-25 18:17:43,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 52: [2022-11-25 18:17:43,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-25 18:17:43,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-25 18:17:43,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 18:17:43,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 54: [2022-11-25 18:17:43,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-25 18:17:43,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 8: [2022-11-25 18:17:43,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 63: [2022-11-25 18:17:43,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 8: [2022-11-25 18:17:43,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 63: [2022-11-25 18:17:43,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 8: [2022-11-25 18:17:43,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 18:17:43,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 32: [2022-11-25 18:17:43,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-25 18:17:43,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-25 18:17:43,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 18:17:43,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 27: [2022-11-25 18:17:43,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 48: [2022-11-25 18:17:43,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:17:43,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 10: [2022-11-25 18:17:43,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 27: [2022-11-25 18:17:43,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 48: [2022-11-25 18:17:43,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 18:17:43,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 27: [2022-11-25 18:17:43,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-25 18:17:43,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-25 18:17:43,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 27: [2022-11-25 18:17:43,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 18:17:43,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:17:43,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-25 18:17:43,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 18:17:43,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 57: [2022-11-25 18:17:43,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-25 18:17:43,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 18:17:43,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:17:43,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-25 18:17:43,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 18:17:43,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:17:43,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-25 18:17:43,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 18:17:43,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:17:43,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:17:43,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 7: [2022-11-25 18:17:43,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 17: [2022-11-25 18:17:43,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 7: [2022-11-25 18:17:43,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 11: [2022-11-25 18:17:43,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:17:43,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-25 18:17:43,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 19: [2022-11-25 18:17:43,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:17:43,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-25 18:17:43,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 18:17:43,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-25 18:17:43,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-25 18:17:43,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 50: [2022-11-25 18:17:43,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 50: [2022-11-25 18:17:43,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 29: [2022-11-25 18:17:43,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 50: [2022-11-25 18:17:43,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 18:17:43,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-25 18:17:43,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 2: [2022-11-25 18:17:43,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 42: [2022-11-25 18:17:43,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 2: [2022-11-25 18:17:43,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-25 18:17:43,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 18:17:43,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 42: [2022-11-25 18:17:43,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 35: [2022-11-25 18:17:43,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 35: [2022-11-25 18:17:43,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-25 18:17:43,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 20: [2022-11-25 18:17:43,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:17:43,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-25 18:17:43,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 37: [2022-11-25 18:17:43,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-25 18:17:43,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-25 18:17:43,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 5: [2022-11-25 18:17:43,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 2: [2022-11-25 18:17:43,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:17:43,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-25 18:17:43,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 43: [2022-11-25 18:17:43,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 2: [2022-11-25 18:17:43,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 43: [2022-11-25 18:17:43,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 43: [2022-11-25 18:17:43,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 2: [2022-11-25 18:17:43,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 49: [2022-11-25 18:17:43,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:17:43,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-25 18:17:43,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 52: [2022-11-25 18:17:43,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 52: [2022-11-25 18:17:43,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-25 18:17:43,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 24: [2022-11-25 18:17:43,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 59: [2022-11-25 18:17:43,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-25 18:17:43,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 24: [2022-11-25 18:17:43,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 59: [2022-11-25 18:17:43,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 24: [2022-11-25 18:17:43,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 55: [2022-11-25 18:17:43,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:17:43,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-25 18:17:43,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 33: [2022-11-25 18:17:43,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-25 18:17:43,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-25 18:17:43,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 9: [2022-11-25 18:17:43,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-25 18:17:43,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-25 18:17:43,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 8: [2022-11-25 18:17:43,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 37: [2022-11-25 18:17:43,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-25 18:17:43,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 8: [2022-11-25 18:17:43,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 37: [2022-11-25 18:17:43,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 8: [2022-11-25 18:17:43,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 18:17:43,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 4: [2022-11-25 18:17:43,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 4: [2022-11-25 18:17:43,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-25 18:17:43,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-25 18:17:43,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-25 18:17:43,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 4: [2022-11-25 18:17:43,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 18:17:43,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-25 18:17:43,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 31: [2022-11-25 18:17:43,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:17:43,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:17:43,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-25 18:17:43,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-25 18:17:43,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 31: [2022-11-25 18:17:43,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 18:17:43,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 56: [2022-11-25 18:17:43,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 25: [2022-11-25 18:17:43,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:17:43,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:17:43,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:17:43,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 56: [2022-11-25 18:17:43,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 25: [2022-11-25 18:17:43,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 56: [2022-11-25 18:17:43,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 25: [2022-11-25 18:17:43,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 18:17:43,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-25 18:17:43,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 22: [2022-11-25 18:17:43,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 56: [2022-11-25 18:17:43,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 18:17:43,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 18:17:43,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 18:17:43,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 22: [2022-11-25 18:17:43,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 18:17:43,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 52: [2022-11-25 18:17:43,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 0: [2022-11-25 18:17:43,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 52: [2022-11-25 18:17:43,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 0: [2022-11-25 18:17:43,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 52: [2022-11-25 18:17:43,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 62: [2022-11-25 18:17:43,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 41: [2022-11-25 18:17:43,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 62: [2022-11-25 18:17:43,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 41: [2022-11-25 18:17:43,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 62: [2022-11-25 18:17:43,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 41: [2022-11-25 18:17:43,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 46: [2022-11-25 18:17:43,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-25 18:17:43,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-25 18:17:43,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 5: [2022-11-25 18:17:43,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:17:43,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-25 18:17:43,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 19: [2022-11-25 18:17:43,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:17:43,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-25 18:17:43,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 24: [2022-11-25 18:17:43,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:17:43,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-25 18:17:43,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 32: [2022-11-25 18:17:43,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-25 18:17:43,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-25 18:17:43,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 6: [2022-11-25 18:17:43,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:17:43,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:17:43,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-25 18:17:43,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-25 18:17:43,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 18:17:43,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:17:43,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 18:17:43,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 42: [2022-11-25 18:17:43,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 3: [2022-11-25 18:17:43,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 18:17:43,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-25 18:17:43,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 18:17:43,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 62: [2022-11-25 18:17:43,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 29: [2022-11-25 18:17:43,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 62: [2022-11-25 18:17:43,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 29: [2022-11-25 18:17:43,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 62: [2022-11-25 18:17:43,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 30: [2022-11-25 18:17:43,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:17:43,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-25 18:17:43,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 28: [2022-11-25 18:17:43,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:17:43,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-25 18:17:43,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 9: [2022-11-25 18:17:43,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:17:43,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:17:43,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 7: [2022-11-25 18:17:43,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 38: [2022-11-25 18:17:43,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-25 18:17:43,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-25 18:17:43,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 20: [2022-11-25 18:17:43,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:17:43,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 20: [2022-11-25 18:17:43,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 40: [2022-11-25 18:17:43,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-25 18:17:43,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-25 18:17:43,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 9: [2022-11-25 18:17:43,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-25 18:17:43,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 60: [2022-11-25 18:17:43,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 60: [2022-11-25 18:17:43,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 60: [2022-11-25 18:17:43,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 41: [2022-11-25 18:17:43,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 21: [2022-11-25 18:17:43,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:17:43,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:17:43,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 41: [2022-11-25 18:17:43,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 41: [2022-11-25 18:17:43,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 18:17:43,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 21: [2022-11-25 18:17:43,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 15: [2022-11-25 18:17:43,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:17:43,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-25 18:17:43,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-25 18:17:43,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 21: [2022-11-25 18:17:43,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 18:17:43,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 21: [2022-11-25 18:17:43,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 18:17:43,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-25 18:17:43,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 18:17:43,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 57: [2022-11-25 18:17:43,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 18:17:43,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-25 18:17:43,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 35: [2022-11-25 18:17:43,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-25 18:17:43,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-25 18:17:43,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 6: [2022-11-25 18:17:43,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:17:43,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 6: [2022-11-25 18:17:43,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 46: [2022-11-25 18:17:43,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-25 18:17:43,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 46: [2022-11-25 18:17:43,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 51: [2022-11-25 18:17:43,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 51: [2022-11-25 18:17:43,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 51: [2022-11-25 18:17:43,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-25 18:17:43,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-25 18:17:43,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 51: [2022-11-25 18:17:43,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 2: [2022-11-25 18:17:43,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-25 18:17:43,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-25 18:17:43,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 44: [2022-11-25 18:17:43,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-25 18:17:43,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-25 18:17:43,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 22: [2022-11-25 18:17:43,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:17:43,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-25 18:17:43,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 12: [2022-11-25 18:17:43,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-25 18:17:43,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-25 18:17:43,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-25 18:17:43,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-25 18:17:43,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-25 18:17:43,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-25 18:17:43,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 12: [2022-11-25 18:17:43,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 12: [2022-11-25 18:17:43,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 11: [2022-11-25 18:17:43,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:17:43,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 11: [2022-11-25 18:17:43,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 25: [2022-11-25 18:17:43,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 25: [2022-11-25 18:17:43,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-25 18:17:43,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 53: [2022-11-25 18:17:43,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-25 18:17:43,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-25 18:17:43,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 18: [2022-11-25 18:17:43,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 53: [2022-11-25 18:17:43,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-25 18:17:43,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 18:17:43,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 53: [2022-11-25 18:17:43,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 18:17:43,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 18:17:43,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:17:43,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-25 18:17:43,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 30: [2022-11-25 18:17:43,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:17:43,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 0: [2022-11-25 18:17:43,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 30: [2022-11-25 18:17:43,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 18:17:43,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 47: [2022-11-25 18:17:43,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:17:43,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-25 18:17:43,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 28: [2022-11-25 18:17:43,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:17:43,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-25 18:17:43,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 40: [2022-11-25 18:17:43,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-25 18:17:43,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-25 18:17:43,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 44: [2022-11-25 18:17:43,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 26: [2022-11-25 18:17:43,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 44: [2022-11-25 18:17:43,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 63: [2022-11-25 18:17:43,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 44: [2022-11-25 18:17:43,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 18:17:43,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-25 18:17:43,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 26: [2022-11-25 18:17:43,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-25 18:17:43,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 25: [2022-11-25 18:17:43,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-25 18:17:43,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-25 18:17:43,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 32: [2022-11-25 18:17:43,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-25 18:17:43,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-25 18:17:43,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 18:17:43,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 54: [2022-11-25 18:17:43,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-25 18:17:43,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 13: [2022-11-25 18:17:43,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-25 18:17:43,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-25 18:17:43,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 18:17:43,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 57: [2022-11-25 18:17:43,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 57: [2022-11-25 18:17:43,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 26: [2022-11-25 18:17:43,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:17:43,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-25 18:17:43,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 18:17:43,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-25 18:17:43,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-25 18:17:43,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 43: [2022-11-25 18:17:43,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-25 18:17:43,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-25 18:17:43,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 55: [2022-11-25 18:17:43,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:17:43,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:17:43,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 49: [2022-11-25 18:17:43,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 55: [2022-11-25 18:17:43,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 49: [2022-11-25 18:17:43,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 18:17:43,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 34: [2022-11-25 18:17:43,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-25 18:17:43,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 18:17:43,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-25 18:17:43,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-25 18:17:43,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 45: [2022-11-25 18:17:43,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-25 18:17:43,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 27: [2022-11-25 18:17:43,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 45: [2022-11-25 18:17:43,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 27: [2022-11-25 18:17:43,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-25 18:17:43,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 53: [2022-11-25 18:17:43,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-25 18:17:43,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-25 18:17:43,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 18:17:43,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:17:43,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 3: [2022-11-25 18:17:43,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 58: [2022-11-25 18:17:43,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-25 18:17:43,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-25 18:17:43,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 21: [2022-11-25 18:17:43,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:17:43,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-25 18:17:43,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 37: [2022-11-25 18:17:43,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-25 18:17:43,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-25 18:17:43,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 59: [2022-11-25 18:17:43,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 59: [2022-11-25 18:17:43,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-25 18:17:43,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 59: [2022-11-25 18:17:43,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 59: [2022-11-25 18:17:43,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-25 18:17:43,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 40: [2022-11-25 18:17:43,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 40: [2022-11-25 18:17:43,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 40: [2022-11-25 18:17:43,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 19: [2022-11-25 18:17:43,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:17:43,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-25 18:17:43,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 18:17:43,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:17:43,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-25 18:17:43,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 18:17:43,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 36: [2022-11-25 18:17:43,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-25 18:17:43,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 18:17:43,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-25 18:17:43,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-25 18:17:43,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 11: [2022-11-25 18:17:43,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:17:43,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-25 18:17:43,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 4: [2022-11-25 18:17:43,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-25 18:17:43,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-25 18:17:43,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 55: [2022-11-25 18:17:43,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:17:43,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-25 18:17:43,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 33: [2022-11-25 18:17:43,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-25 18:17:43,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-25 18:17:43,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 18:17:43,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-25 18:17:43,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-25 18:17:43,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 18:17:43,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:17:43,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-25 18:17:43,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 62: [2022-11-25 18:17:43,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-25 18:17:43,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-25 18:17:43,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 45: [2022-11-25 18:17:43,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-25 18:17:43,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-25 18:17:43,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 5: [2022-11-25 18:17:43,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:17:43,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-25 18:17:43,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 18:17:43,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 42: [2022-11-25 18:17:43,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-25 18:17:43,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 30: [2022-11-25 18:17:43,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:17:43,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-25 18:17:43,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 18:17:43,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-25 18:17:43,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-25 18:17:43,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 8: [2022-11-25 18:17:43,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-25 18:17:43,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-25 18:17:43,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 13: [2022-11-25 18:17:43,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 13: [2022-11-25 18:17:43,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-25 18:17:43,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 43: [2022-11-25 18:17:43,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-25 18:17:43,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-25 18:17:43,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 37: [2022-11-25 18:17:43,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 14: [2022-11-25 18:17:43,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 37: [2022-11-25 18:17:43,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-25 18:17:43,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 18:17:43,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-25 18:17:43,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 47: [2022-11-25 18:17:43,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:17:43,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-25 18:17:43,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 18:17:43,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-25 18:17:43,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-25 18:17:43,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 18:17:43,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 61: [2022-11-25 18:17:43,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 61: [2022-11-25 18:17:43,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 18:17:43,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 14: [2022-11-25 18:17:43,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-25 18:17:43,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 24: [2022-11-25 18:17:43,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:17:43,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-25 18:17:43,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 31: [2022-11-25 18:17:43,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:17:43,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-25 18:17:43,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 18:17:43,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:17:43,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-25 18:17:43,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 47: [2022-11-25 18:17:43,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:17:43,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-25 18:17:43,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 35: [2022-11-25 18:17:43,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-25 18:17:43,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-25 18:17:43,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 18:17:43,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-25 18:17:43,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-25 18:17:43,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 41: [2022-11-25 18:17:43,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 38: [2022-11-25 18:17:43,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 41: [2022-11-25 18:17:43,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 38: [2022-11-25 18:17:43,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 41: [2022-11-25 18:17:43,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 38: [2022-11-25 18:17:43,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 58: [2022-11-25 18:17:43,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 58: [2022-11-25 18:17:43,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-25 18:17:43,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 52: [2022-11-25 18:17:43,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 15: [2022-11-25 18:17:43,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 52: [2022-11-25 18:17:43,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-25 18:17:43,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 18:17:43,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-25 18:17:43,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 18:17:43,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 10: [2022-11-25 18:17:43,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 10: [2022-11-25 18:17:43,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 60: [2022-11-25 18:17:43,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-25 18:17:43,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-25 18:17:43,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 18:17:43,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:17:43,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 17: [2022-11-25 18:17:43,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 9: [2022-11-25 18:17:43,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-25 18:17:43,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-25 18:17:43,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 22: [2022-11-25 18:17:43,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:17:43,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-25 18:17:43,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 7: [2022-11-25 18:17:43,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:17:43,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-25 18:17:43,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 12: [2022-11-25 18:17:43,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-25 18:17:43,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-25 18:17:43,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 58: [2022-11-25 18:17:43,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-25 18:17:43,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-25 18:17:43,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 60: [2022-11-25 18:17:43,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 60: [2022-11-25 18:17:43,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-25 18:17:43,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 18:17:43,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:17:43,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-25 18:17:43,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 18:17:43,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-25 18:17:43,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 28: [2022-11-25 18:17:43,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 0: [2022-11-25 18:17:43,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 28: [2022-11-25 18:17:43,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-25 18:17:43,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 37: [2022-11-25 18:17:43,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 2: [2022-11-25 18:17:43,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 37: [2022-11-25 18:17:43,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-25 18:17:43,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 2: [2022-11-25 18:17:43,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-25 18:17:43,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 46: [2022-11-25 18:17:43,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-25 18:17:43,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-25 18:17:43,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 18:17:43,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 51: [2022-11-25 18:17:43,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-25 18:17:43,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 61: [2022-11-25 18:17:43,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 51: [2022-11-25 18:17:43,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 18:17:43,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 18:17:43,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 61: [2022-11-25 18:17:43,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-25 18:17:43,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 50: [2022-11-25 18:17:43,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-25 18:17:43,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-25 18:17:43,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 49: [2022-11-25 18:17:43,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:17:43,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-25 18:17:43,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 6: [2022-11-25 18:17:43,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:17:43,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 6: [2022-11-25 18:17:43,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 27: [2022-11-25 18:17:43,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 27: [2022-11-25 18:17:43,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-25 18:17:43,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 18:17:43,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 54: [2022-11-25 18:17:43,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 33: [2022-11-25 18:17:43,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:17:43,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 33: [2022-11-25 18:17:43,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 48: [2022-11-25 18:17:43,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 33: [2022-11-25 18:17:43,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 18:17:43,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 44: [2022-11-25 18:17:43,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 44: [2022-11-25 18:17:43,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 44: [2022-11-25 18:17:43,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 18:17:43,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 25: [2022-11-25 18:17:43,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:17:43,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-25 18:17:43,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 25: [2022-11-25 18:17:43,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-25 18:17:43,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 50: [2022-11-25 18:17:43,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-25 18:17:43,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-25 18:17:43,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 18:17:43,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 32: [2022-11-25 18:17:43,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 32: [2022-11-25 18:17:43,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-25 18:17:43,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 18:17:43,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-25 18:17:43,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-25 18:17:43,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 20: [2022-11-25 18:17:43,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:17:43,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-25 18:17:43,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 13: [2022-11-25 18:17:43,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-25 18:17:43,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-25 18:17:43,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 18:17:43,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:17:43,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 3: [2022-11-25 18:17:43,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 18:17:43,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-25 18:17:43,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-25 18:17:43,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 45: [2022-11-25 18:17:43,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-25 18:17:43,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-25 18:17:43,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 18:17:43,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 57: [2022-11-25 18:17:43,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 57: [2022-11-25 18:17:43,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 11: [2022-11-25 18:17:43,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:17:43,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 11: [2022-11-25 18:17:43,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 40: [2022-11-25 18:17:43,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-25 18:17:43,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-25 18:17:43,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 18:17:43,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 36: [2022-11-25 18:17:43,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-25 18:17:43,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 59: [2022-11-25 18:17:43,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-25 18:17:43,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-25 18:17:43,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 21: [2022-11-25 18:17:43,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:17:43,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-25 18:17:43,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 18:17:43,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-25 18:17:43,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-25 18:17:43,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 9: [2022-11-25 18:17:43,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 9: [2022-11-25 18:17:43,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-25 18:17:43,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 47: [2022-11-25 18:17:43,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:17:43,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-25 18:17:43,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 41: [2022-11-25 18:17:43,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-25 18:17:43,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-25 18:17:43,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 18:17:43,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-25 18:17:43,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-25 18:17:43,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 53: [2022-11-25 18:17:43,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-25 18:17:43,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-25 18:17:43,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 18:17:43,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 14: [2022-11-25 18:17:43,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-25 18:17:43,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 18:17:43,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:17:43,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-25 18:17:43,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 19: [2022-11-25 18:17:43,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:17:43,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-25 18:17:43,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 55: [2022-11-25 18:17:43,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:17:43,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-25 18:17:43,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 18:17:43,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 4: [2022-11-25 18:17:43,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 61: [2022-11-25 18:17:43,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 4: [2022-11-25 18:17:43,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 61: [2022-11-25 18:17:43,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 4: [2022-11-25 18:17:43,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 18:17:43,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:17:43,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-25 18:17:43,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 62: [2022-11-25 18:17:43,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-25 18:17:43,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-25 18:17:43,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 18:17:43,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:17:43,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-25 18:17:43,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 58: [2022-11-25 18:17:43,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-25 18:17:43,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-25 18:17:43,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 41: [2022-11-25 18:17:43,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-25 18:17:43,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-25 18:17:43,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 60: [2022-11-25 18:17:43,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-25 18:17:43,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 38: [2022-11-25 18:17:43,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 60: [2022-11-25 18:17:43,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 38: [2022-11-25 18:17:43,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-25 18:17:43,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 33: [2022-11-25 18:17:43,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-25 18:17:43,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-25 18:17:43,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 43: [2022-11-25 18:17:43,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-25 18:17:43,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-25 18:17:43,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 26: [2022-11-25 18:17:43,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:17:43,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-25 18:17:43,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 5: [2022-11-25 18:17:43,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:17:43,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-25 18:17:43,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 20: [2022-11-25 18:17:43,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:17:43,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-25 18:17:43,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 30: [2022-11-25 18:17:43,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:17:43,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-25 18:17:43,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 18:17:43,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 34: [2022-11-25 18:17:43,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-25 18:17:43,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 8: [2022-11-25 18:17:43,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:17:43,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 8: [2022-11-25 18:17:43,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 31: [2022-11-25 18:17:43,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 8: [2022-11-25 18:17:43,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 31: [2022-11-25 18:17:43,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 12: [2022-11-25 18:17:43,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-25 18:17:43,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-25 18:17:43,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 18:17:43,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 28: [2022-11-25 18:17:43,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:17:43,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 34: [2022-11-25 18:17:43,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 28: [2022-11-25 18:17:43,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 18:17:43,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 52: [2022-11-25 18:17:43,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-25 18:17:43,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-25 18:17:43,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 18:17:43,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-25 18:17:43,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-25 18:17:43,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 51: [2022-11-25 18:17:43,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-25 18:17:43,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-25 18:17:43,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 18:17:43,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 54: [2022-11-25 18:17:43,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-25 18:17:43,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 24: [2022-11-25 18:17:43,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:17:43,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-25 18:17:43,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 18:17:43,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:17:43,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-25 18:17:43,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 7: [2022-11-25 18:17:43,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:17:43,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-25 18:17:43,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 18:17:43,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-25 18:17:43,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-25 18:17:43,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 18:17:43,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:17:43,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 10: [2022-11-25 18:17:43,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 17: [2022-11-25 18:17:43,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 10: [2022-11-25 18:17:43,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 18:17:43,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 50: [2022-11-25 18:17:43,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-25 18:17:43,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-25 18:17:43,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 18:17:43,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:17:43,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-25 18:17:43,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 18:17:43,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 0: [2022-11-25 18:17:43,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 0: [2022-11-25 18:17:43,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 22: [2022-11-25 18:17:43,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:17:43,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-25 18:17:43,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 18:17:43,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:17:43,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 16: [2022-11-25 18:17:43,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 44: [2022-11-25 18:17:43,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-25 18:17:43,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-25 18:17:43,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 18:17:43,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 46: [2022-11-25 18:17:43,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-25 18:17:43,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 42: [2022-11-25 18:17:43,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 46: [2022-11-25 18:17:43,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 18:17:43,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 49: [2022-11-25 18:17:43,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:17:43,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 45: [2022-11-25 18:17:43,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:17:43,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 45: [2022-11-25 18:17:43,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-25 18:17:43,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 37: [2022-11-25 18:17:43,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-25 18:17:43,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-25 18:17:43,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 2: [2022-11-25 18:17:43,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-25 18:17:43,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-25 18:17:43,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 18:17:43,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-25 18:17:43,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-25 18:17:43,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 6: [2022-11-25 18:17:43,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:17:43,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 38: [2022-11-25 18:17:43,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-25 18:17:43,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 6: [2022-11-25 18:17:43,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 38: [2022-11-25 18:17:43,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 25: [2022-11-25 18:17:43,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-25 18:17:43,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-25 18:17:43,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 40: [2022-11-25 18:17:43,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-25 18:17:43,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-25 18:17:43,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 35: [2022-11-25 18:17:43,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 35: [2022-11-25 18:17:43,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-25 18:17:43,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 18:17:43,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 57: [2022-11-25 18:17:43,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 57: [2022-11-25 18:17:43,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 32: [2022-11-25 18:17:43,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-25 18:17:43,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-25 18:17:43,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 18:17:43,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:17:43,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-25 18:17:43,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 11: [2022-11-25 18:17:43,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:17:43,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-25 18:17:43,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 59: [2022-11-25 18:17:43,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-25 18:17:43,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-25 18:17:43,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 18:17:43,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 36: [2022-11-25 18:17:43,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 56: [2022-11-25 18:17:43,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 36: [2022-11-25 18:17:43,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 56: [2022-11-25 18:17:43,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 18:17:43,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 18:17:43,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-25 18:17:43,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-25 18:17:43,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 18:17:43,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-25 18:17:43,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-25 18:17:43,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 13: [2022-11-25 18:17:43,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-25 18:17:43,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-25 18:17:43,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 53: [2022-11-25 18:17:43,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-25 18:17:43,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-25 18:17:43,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 9: [2022-11-25 18:17:43,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-25 18:17:43,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-25 18:17:43,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 33: [2022-11-25 18:17:43,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-25 18:17:43,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-25 18:17:43,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 35: [2022-11-25 18:17:43,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-25 18:17:43,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-25 18:17:43,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 41: [2022-11-25 18:17:43,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-25 18:17:43,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-25 18:17:43,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 4: [2022-11-25 18:17:43,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 4: [2022-11-25 18:17:43,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 55: [2022-11-25 18:17:43,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 4: [2022-11-25 18:17:43,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 55: [2022-11-25 18:17:43,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-25 18:17:43,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 18:17:43,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 34: [2022-11-25 18:17:43,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-25 18:17:43,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 18:17:43,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:17:43,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-25 18:17:43,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 18:17:43,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 29: [2022-11-25 18:17:43,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-25 18:17:43,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 19: [2022-11-25 18:17:43,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:17:43,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-25 18:17:43,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 24: [2022-11-25 18:17:43,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:17:43,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-25 18:17:43,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 20: [2022-11-25 18:17:43,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:17:43,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-25 18:17:43,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 62: [2022-11-25 18:17:43,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-25 18:17:43,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-25 18:17:43,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 43: [2022-11-25 18:17:43,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-25 18:17:43,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-25 18:17:43,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 60: [2022-11-25 18:17:43,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-25 18:17:43,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-25 18:17:43,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 31: [2022-11-25 18:17:43,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:17:43,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-25 18:17:43,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 18:17:43,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:17:43,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-25 18:17:43,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 38: [2022-11-25 18:17:43,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-25 18:17:43,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-25 18:17:43,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 8: [2022-11-25 18:17:43,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-25 18:17:43,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-25 18:17:43,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 5: [2022-11-25 18:17:43,809] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:17:43,809] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-25 18:17:43,809] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 28: [2022-11-25 18:17:43,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:17:43,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-25 18:17:43,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 18:17:43,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-25 18:17:43,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-25 18:17:43,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 21: [2022-11-25 18:17:43,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:17:43,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-25 18:17:43,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 51: [2022-11-25 18:17:43,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-25 18:17:43,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-25 18:17:43,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 50: [2022-11-25 18:17:43,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 50: [2022-11-25 18:17:43,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-25 18:17:43,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 18:17:43,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:17:43,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-25 18:17:43,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 7: [2022-11-25 18:17:43,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 52: [2022-11-25 18:17:43,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 7: [2022-11-25 18:17:43,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 52: [2022-11-25 18:17:43,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 7: [2022-11-25 18:17:43,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 52: [2022-11-25 18:17:43,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 47: [2022-11-25 18:17:43,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:17:43,813] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-25 18:17:43,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 18:17:43,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:17:43,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:17:43,813] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-25 18:17:43,813] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 15: [2022-11-25 18:17:43,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 18:17:43,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 18:17:43,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 0: [2022-11-25 18:17:43,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-25 18:17:43,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 18:17:43,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 42: [2022-11-25 18:17:43,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-25 18:17:43,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 18:17:43,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 54: [2022-11-25 18:17:43,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-25 18:17:43,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 18:17:43,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-25 18:17:43,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-25 18:17:43,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 44: [2022-11-25 18:17:43,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-25 18:17:43,817] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-25 18:17:43,817] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 45: [2022-11-25 18:17:43,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 45: [2022-11-25 18:17:43,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 45: [2022-11-25 18:17:43,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 6: [2022-11-25 18:17:43,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:17:43,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-25 18:17:43,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 18:17:43,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-25 18:17:43,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-25 18:17:43,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 37: [2022-11-25 18:17:43,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 2: [2022-11-25 18:17:43,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 37: [2022-11-25 18:17:43,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-25 18:17:43,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 2: [2022-11-25 18:17:43,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-25 18:17:43,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 30: [2022-11-25 18:17:43,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:17:43,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-25 18:17:43,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 40: [2022-11-25 18:17:43,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-25 18:17:43,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-25 18:17:43,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 32: [2022-11-25 18:17:43,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 32: [2022-11-25 18:17:43,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-25 18:17:43,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 18:17:43,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 57: [2022-11-25 18:17:43,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-25 18:17:43,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 18:17:43,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 27: [2022-11-25 18:17:43,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:17:43,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 27: [2022-11-25 18:17:43,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-25 18:17:43,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 30: [2022-11-25 18:17:43,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 30: [2022-11-25 18:17:43,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 18:17:43,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-25 18:17:43,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 18:17:43,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-25 18:17:43,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-25 18:17:43,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 18:17:43,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:17:43,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-25 18:17:43,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 23: [2022-11-25 18:17:43,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:17:43,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-25 18:17:43,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 13: [2022-11-25 18:17:43,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-25 18:17:43,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-25 18:17:43,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 58: [2022-11-25 18:17:43,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-25 18:17:43,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-25 18:17:43,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 21: [2022-11-25 18:17:43,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:17:43,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 21: [2022-11-25 18:17:43,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 18:17:43,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 56: [2022-11-25 18:17:43,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-25 18:17:43,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 33: [2022-11-25 18:17:43,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 12: [2022-11-25 18:17:43,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 33: [2022-11-25 18:17:43,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-25 18:17:43,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 12: [2022-11-25 18:17:43,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-25 18:17:43,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 35: [2022-11-25 18:17:43,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-25 18:17:43,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-25 18:17:43,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 59: [2022-11-25 18:17:43,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-25 18:17:43,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-25 18:17:43,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 46: [2022-11-25 18:17:43,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-25 18:17:43,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-25 18:17:43,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 9: [2022-11-25 18:17:43,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 9: [2022-11-25 18:17:43,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-25 18:17:43,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 53: [2022-11-25 18:17:43,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-25 18:17:43,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-25 18:17:43,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 41: [2022-11-25 18:17:43,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-25 18:17:43,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 41: [2022-11-25 18:17:43,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 19: [2022-11-25 18:17:43,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:17:43,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-25 18:17:43,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 60: [2022-11-25 18:17:43,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-25 18:17:43,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-25 18:17:43,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 22: [2022-11-25 18:17:43,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:17:43,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 22: [2022-11-25 18:17:43,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 51: [2022-11-25 18:17:43,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-25 18:17:43,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-25 18:17:43,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 18:17:43,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:17:43,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-25 18:17:43,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 55: [2022-11-25 18:17:43,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:17:43,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 62: [2022-11-25 18:17:43,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:17:43,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 62: [2022-11-25 18:17:43,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-25 18:17:43,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 18:17:43,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 34: [2022-11-25 18:17:43,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-25 18:17:43,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 5: [2022-11-25 18:17:43,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:17:43,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-25 18:17:43,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 4: [2022-11-25 18:17:43,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-25 18:17:43,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-25 18:17:43,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 18:17:43,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:17:43,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:17:43,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 1: [2022-11-25 18:17:43,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-25 18:17:43,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 18:17:43,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 43: [2022-11-25 18:17:43,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-25 18:17:43,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 43: [2022-11-25 18:17:43,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 11: [2022-11-25 18:17:43,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:17:43,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-25 18:17:43,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 47: [2022-11-25 18:17:43,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 24: [2022-11-25 18:17:43,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 47: [2022-11-25 18:17:43,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-25 18:17:43,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 24: [2022-11-25 18:17:43,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-25 18:17:43,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 38: [2022-11-25 18:17:43,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 38: [2022-11-25 18:17:43,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-25 18:17:43,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 8: [2022-11-25 18:17:43,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-25 18:17:43,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-25 18:17:43,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 18:17:43,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-25 18:17:43,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-25 18:17:43,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 52: [2022-11-25 18:17:43,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-25 18:17:43,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-25 18:17:43,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 18:17:43,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 31: [2022-11-25 18:17:43,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:17:43,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-25 18:17:43,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 18:17:43,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-25 18:17:43,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 45: [2022-11-25 18:17:43,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 45: [2022-11-25 18:17:43,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-25 18:17:43,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 50: [2022-11-25 18:17:43,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-25 18:17:43,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-25 18:17:43,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 12: [2022-11-25 18:17:43,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-25 18:17:43,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-25 18:17:43,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 18:17:43,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:17:43,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-25 18:17:43,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 40: [2022-11-25 18:17:43,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-25 18:17:43,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-25 18:17:43,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 37: [2022-11-25 18:17:43,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 37: [2022-11-25 18:17:43,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-25 18:17:43,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 18:17:43,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-25 18:17:43,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-25 18:17:43,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 18:17:43,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:17:43,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-25 18:17:43,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 28: [2022-11-25 18:17:43,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:17:43,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-25 18:17:43,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 44: [2022-11-25 18:17:43,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 14: [2022-11-25 18:17:43,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 44: [2022-11-25 18:17:43,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 14: [2022-11-25 18:17:43,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 44: [2022-11-25 18:17:43,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 18:17:43,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 18:17:43,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-25 18:17:43,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-25 18:17:43,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 35: [2022-11-25 18:17:43,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:17:43,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 35: [2022-11-25 18:17:43,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 48: [2022-11-25 18:17:43,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 10: [2022-11-25 18:17:43,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 35: [2022-11-25 18:17:43,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 18:17:43,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 18:17:43,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 10: [2022-11-25 18:17:43,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-25 18:17:43,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 18:17:43,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-25 18:17:43,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 21: [2022-11-25 18:17:43,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:17:43,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-25 18:17:43,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 49: [2022-11-25 18:17:43,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:17:43,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-25 18:17:43,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 6: [2022-11-25 18:17:43,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:17:43,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-25 18:17:43,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 32: [2022-11-25 18:17:43,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-25 18:17:43,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-25 18:17:43,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 59: [2022-11-25 18:17:43,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-25 18:17:43,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-25 18:17:43,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 18:17:43,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 58: [2022-11-25 18:17:43,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 10: [2022-11-25 18:17:43,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 58: [2022-11-25 18:17:43,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 10: [2022-11-25 18:17:43,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 58: [2022-11-25 18:17:43,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 7: [2022-11-25 18:17:43,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 36: [2022-11-25 18:17:43,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 7: [2022-11-25 18:17:43,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 36: [2022-11-25 18:17:43,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 7: [2022-11-25 18:17:43,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 18:17:43,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 19: [2022-11-25 18:17:43,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:17:43,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-25 18:17:43,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 49: [2022-11-25 18:17:43,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 63: [2022-11-25 18:17:43,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-25 18:17:43,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:17:43,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 63: [2022-11-25 18:17:43,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 49: [2022-11-25 18:17:43,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 18:17:43,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-25 18:17:43,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 18:17:43,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 18:17:43,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-25 18:17:43,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-25 18:17:43,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 26: [2022-11-25 18:17:43,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:17:43,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:17:43,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 11: [2022-11-25 18:17:43,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 26: [2022-11-25 18:17:43,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 11: [2022-11-25 18:17:43,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 26: [2022-11-25 18:17:43,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:17:43,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-25 18:17:43,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 2: [2022-11-25 18:17:43,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 2: [2022-11-25 18:17:43,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-25 18:17:43,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 25: [2022-11-25 18:17:43,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 25: [2022-11-25 18:17:43,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 25: [2022-11-25 18:17:43,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 25: [2022-11-25 18:17:43,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-25 18:17:43,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-25 18:17:43,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 27: [2022-11-25 18:17:43,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-25 18:17:43,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-25 18:17:43,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 13: [2022-11-25 18:17:43,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-25 18:17:43,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-25 18:17:43,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 33: [2022-11-25 18:17:43,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 33: [2022-11-25 18:17:43,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-25 18:17:43,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 62: [2022-11-25 18:17:43,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-25 18:17:43,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-25 18:17:43,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 55: [2022-11-25 18:17:43,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:17:43,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 60: [2022-11-25 18:17:43,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:17:43,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 60: [2022-11-25 18:17:43,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-25 18:17:43,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 4: [2022-11-25 18:17:43,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 4: [2022-11-25 18:17:43,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-25 18:17:43,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 51: [2022-11-25 18:17:43,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-25 18:17:43,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-25 18:17:43,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 50: [2022-11-25 18:17:43,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-25 18:17:43,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-25 18:17:43,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 28: [2022-11-25 18:17:43,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:17:43,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-25 18:17:43,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 18:17:43,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 42: [2022-11-25 18:17:43,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-25 18:17:43,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 12: [2022-11-25 18:17:43,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 43: [2022-11-25 18:17:43,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 12: [2022-11-25 18:17:43,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-25 18:17:43,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 43: [2022-11-25 18:17:43,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-25 18:17:43,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 47: [2022-11-25 18:17:43,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:17:43,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 47: [2022-11-25 18:17:43,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 5: [2022-11-25 18:17:43,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:17:43,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-25 18:17:43,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 18:17:43,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-25 18:17:43,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-25 18:17:43,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 18:17:43,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:17:43,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-25 18:17:43,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 45: [2022-11-25 18:17:43,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-25 18:17:43,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-25 18:17:43,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 18:17:43,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:17:43,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-25 18:17:43,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 18:17:43,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 27: [2022-11-25 18:17:43,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 56: [2022-11-25 18:17:43,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 27: [2022-11-25 18:17:43,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-25 18:17:43,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 18:17:43,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 11: [2022-11-25 18:17:43,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:17:43,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 20: [2022-11-25 18:17:43,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:17:43,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:17:43,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 20: [2022-11-25 18:17:43,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 19: [2022-11-25 18:17:43,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 20: [2022-11-25 18:17:43,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 19: [2022-11-25 18:17:43,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 53: [2022-11-25 18:17:43,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 60: [2022-11-25 18:17:43,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 23: [2022-11-25 18:17:43,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 53: [2022-11-25 18:17:43,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 60: [2022-11-25 18:17:43,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 53: [2022-11-25 18:17:43,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 60: [2022-11-25 18:17:43,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 23: [2022-11-25 18:17:43,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 16: [2022-11-25 18:17:43,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 38: [2022-11-25 18:17:43,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 23: [2022-11-25 18:17:43,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 18:17:43,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 38: [2022-11-25 18:17:43,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 16: [2022-11-25 18:17:43,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 38: [2022-11-25 18:17:43,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 18:17:43,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 52: [2022-11-25 18:17:43,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 10: [2022-11-25 18:17:43,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 52: [2022-11-25 18:17:43,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 10: [2022-11-25 18:17:43,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 52: [2022-11-25 18:17:43,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 30: [2022-11-25 18:17:43,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:17:43,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:17:43,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 30: [2022-11-25 18:17:43,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-25 18:17:43,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 30: [2022-11-25 18:17:43,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 26: [2022-11-25 18:17:43,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 62: [2022-11-25 18:17:43,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 8: [2022-11-25 18:17:43,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:17:43,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 62: [2022-11-25 18:17:43,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 8: [2022-11-25 18:17:43,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 26: [2022-11-25 18:17:43,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 18:17:43,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 8: [2022-11-25 18:17:43,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 62: [2022-11-25 18:17:43,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 18:17:43,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-25 18:17:43,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 18:17:43,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 42: [2022-11-25 18:17:43,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 42: [2022-11-25 18:17:43,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 8: [2022-11-25 18:17:43,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-25 18:17:43,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 8: [2022-11-25 18:17:43,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 35: [2022-11-25 18:17:43,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-25 18:17:43,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-25 18:17:43,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 44: [2022-11-25 18:17:43,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-25 18:17:43,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-25 18:17:43,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 18:17:43,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:17:43,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 9: [2022-11-25 18:17:43,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 0: [2022-11-25 18:17:43,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-25 18:17:43,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 18:17:43,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 9: [2022-11-25 18:17:43,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-25 18:17:43,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 18:17:43,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 31: [2022-11-25 18:17:43,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:17:43,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-25 18:17:43,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 6: [2022-11-25 18:17:43,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:17:43,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-25 18:17:43,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 38: [2022-11-25 18:17:43,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-25 18:17:43,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-25 18:17:43,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 2: [2022-11-25 18:17:43,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-25 18:17:43,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-25 18:17:43,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 24: [2022-11-25 18:17:43,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:17:43,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-25 18:17:43,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:17:43,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 24: [2022-11-25 18:17:43,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-25 18:17:43,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 53: [2022-11-25 18:17:43,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-25 18:17:43,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-25 18:17:43,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 33: [2022-11-25 18:17:43,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 34: [2022-11-25 18:17:43,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 44: [2022-11-25 18:17:43,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 33: [2022-11-25 18:17:43,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-25 18:17:43,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 44: [2022-11-25 18:17:43,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 34: [2022-11-25 18:17:43,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 44: [2022-11-25 18:17:43,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 18:17:43,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 4: [2022-11-25 18:17:43,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 4: [2022-11-25 18:17:43,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-25 18:17:43,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 18:17:43,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:17:43,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-25 18:17:43,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 18:17:43,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 36: [2022-11-25 18:17:43,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-25 18:17:43,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 7: [2022-11-25 18:17:43,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:17:43,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 7: [2022-11-25 18:17:43,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 41: [2022-11-25 18:17:43,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 17: [2022-11-25 18:17:43,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 41: [2022-11-25 18:17:43,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 41: [2022-11-25 18:17:43,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 18:17:43,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-25 18:17:43,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 18:17:43,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:17:43,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-25 18:17:43,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 22: [2022-11-25 18:17:43,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:17:43,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-25 18:17:43,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 58: [2022-11-25 18:17:43,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-25 18:17:43,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-25 18:17:43,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 7: [2022-11-25 18:17:43,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:17:43,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:17:43,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 26: [2022-11-25 18:17:43,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-25 18:17:43,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 7: [2022-11-25 18:17:43,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 18:17:43,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:17:43,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:17:43,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 47: [2022-11-25 18:17:43,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 18:17:43,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-25 18:17:43,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 55: [2022-11-25 18:17:43,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:17:43,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-25 18:17:43,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 51: [2022-11-25 18:17:43,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-25 18:17:43,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-25 18:17:43,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 18:17:43,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-25 18:17:43,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 46: [2022-11-25 18:17:43,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-25 18:17:43,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-25 18:17:43,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 18:17:43,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 23: [2022-11-25 18:17:43,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:17:43,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-25 18:17:43,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 22: [2022-11-25 18:17:43,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:17:43,926] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-25 18:17:43,926] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 18:17:43,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:17:43,926] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-25 18:17:43,926] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 20: [2022-11-25 18:17:43,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:17:43,926] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 25: [2022-11-25 18:17:43,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:17:43,926] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 25: [2022-11-25 18:17:43,926] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-25 18:17:43,926] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 18:17:43,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:17:43,927] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-25 18:17:43,927] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 27: [2022-11-25 18:17:43,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:17:43,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 27: [2022-11-25 18:17:43,927] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 15: [2022-11-25 18:17:43,927] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 27: [2022-11-25 18:17:43,927] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 18:17:43,927] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 59: [2022-11-25 18:17:43,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-25 18:17:43,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-25 18:17:43,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 13: [2022-11-25 18:17:43,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 13: [2022-11-25 18:17:43,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-25 18:17:43,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 18:17:43,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:17:43,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-25 18:17:43,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 58: [2022-11-25 18:17:43,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-25 18:17:43,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-25 18:17:43,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 31: [2022-11-25 18:17:43,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:17:43,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-25 18:17:43,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 18:17:43,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-25 18:17:43,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-25 18:17:43,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 18:17:43,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-25 18:17:43,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-25 18:17:43,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 40: [2022-11-25 18:17:43,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-25 18:17:43,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-25 18:17:43,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 18:17:43,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:17:43,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-25 18:17:43,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 18:17:43,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-25 18:17:43,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-25 18:17:43,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 46: [2022-11-25 18:17:43,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-25 18:17:43,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-25 18:17:43,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-25 18:17:43,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-25 18:17:43,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 46: [2022-11-25 18:17:43,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 18:17:43,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:17:43,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-25 18:17:43,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 22: [2022-11-25 18:17:43,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:17:43,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-25 18:17:43,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 20: [2022-11-25 18:17:43,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:17:43,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-25 18:17:43,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 18:17:43,978] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 42: [2022-11-25 18:17:43,978] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-25 18:17:43,978] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 23: [2022-11-25 18:17:44,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:17:44,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-25 18:17:44,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 23: [2022-11-25 18:17:44,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:17:44,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-25 18:17:44,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 23: [2022-11-25 18:17:44,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:17:44,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-25 18:17:44,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 23: [2022-11-25 18:17:44,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:17:44,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 23: [2022-11-25 18:17:44,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 23: [2022-11-25 18:17:44,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:17:44,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 23: [2022-11-25 18:17:44,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 18:17:44,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-25 18:17:44,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step2000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-25 18:17:44,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: successfully saved checkpoint at iteration 2000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5714.11 63: iteration 2010/ 24424 | consumed samples: 1029120 | consumed tokens: 2107637760 | elapsed time per iteration (s): 2.87 | learning rate: 1.976E-04 | global batch size: 512 | lm loss: 2.630501E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.333 | TFLOPs: 18.36 | 63: iteration 2020/ 24424 | consumed samples: 1034240 | consumed tokens: 2118123520 | elapsed time per iteration (s): 2.25 | learning rate: 1.976E-04 | global batch size: 512 | lm loss: 2.598577E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.408 | TFLOPs: 23.41 | 63: iteration 2030/ 24424 | consumed samples: 1039360 | consumed tokens: 2128609280 | elapsed time per iteration (s): 2.26 | learning rate: 1.976E-04 | global batch size: 512 | lm loss: 2.611788E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.369 | TFLOPs: 23.30 | 63: iteration 2040/ 24424 | consumed samples: 1044480 | consumed tokens: 2139095040 | elapsed time per iteration (s): 2.26 | learning rate: 1.976E-04 | global batch size: 512 | lm loss: 2.623412E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.178 | TFLOPs: 23.28 | 63: iteration 2050/ 24424 | consumed samples: 1049600 | consumed tokens: 2149580800 | elapsed time per iteration (s): 2.27 | learning rate: 1.975E-04 | global batch size: 512 | lm loss: 2.612036E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.540 | TFLOPs: 23.22 | 63: iteration 2060/ 24424 | consumed samples: 1054720 | consumed tokens: 2160066560 | elapsed time per iteration (s): 2.23 | learning rate: 1.975E-04 | global batch size: 512 | lm loss: 2.568743E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.344 | TFLOPs: 23.61 | 63: iteration 2070/ 24424 | consumed samples: 1059840 | consumed tokens: 2170552320 | elapsed time per iteration (s): 2.24 | learning rate: 1.975E-04 | global batch size: 512 | lm loss: 2.580281E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.674 | TFLOPs: 23.54 | 63: iteration 2080/ 24424 | consumed samples: 1064960 | consumed tokens: 2181038080 | elapsed time per iteration (s): 2.23 | learning rate: 1.975E-04 | global batch size: 512 | lm loss: 2.598982E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.328 | TFLOPs: 23.61 | 63: iteration 2090/ 24424 | consumed samples: 1070080 | consumed tokens: 2191523840 | elapsed time per iteration (s): 2.25 | learning rate: 1.974E-04 | global batch size: 512 | lm loss: 2.582003E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.553 | TFLOPs: 23.43 | 63: iteration 2100/ 24424 | consumed samples: 1075200 | consumed tokens: 2202009600 | elapsed time per iteration (s): 2.23 | learning rate: 1.974E-04 | global batch size: 512 | lm loss: 2.575577E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.461 | TFLOPs: 23.62 | 63: iteration 2110/ 24424 | consumed samples: 1080320 | consumed tokens: 2212495360 | elapsed time per iteration (s): 2.26 | learning rate: 1.974E-04 | global batch size: 512 | lm loss: 2.597261E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.677 | TFLOPs: 23.34 | 63: iteration 2120/ 24424 | consumed samples: 1085440 | consumed tokens: 2222981120 | elapsed time per iteration (s): 2.25 | learning rate: 1.973E-04 | global batch size: 512 | lm loss: 2.585278E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.519 | TFLOPs: 23.42 | 63: iteration 2130/ 24424 | consumed samples: 1090560 | consumed tokens: 2233466880 | elapsed time per iteration (s): 2.24 | learning rate: 1.973E-04 | global batch size: 512 | lm loss: 2.577351E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.544 | TFLOPs: 23.53 | 63: iteration 2140/ 24424 | consumed samples: 1095680 | consumed tokens: 2243952640 | elapsed time per iteration (s): 2.23 | learning rate: 1.973E-04 | global batch size: 512 | lm loss: 2.566891E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.130 | TFLOPs: 23.59 | 63: iteration 2150/ 24424 | consumed samples: 1100800 | consumed tokens: 2254438400 | elapsed time per iteration (s): 2.26 | learning rate: 1.973E-04 | global batch size: 512 | lm loss: 2.575481E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.297 | TFLOPs: 23.30 | 63: iteration 2160/ 24424 | consumed samples: 1105920 | consumed tokens: 2264924160 | elapsed time per iteration (s): 2.25 | learning rate: 1.972E-04 | global batch size: 512 | lm loss: 2.570197E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.707 | TFLOPs: 23.44 | 63: iteration 2170/ 24424 | consumed samples: 1111040 | consumed tokens: 2275409920 | elapsed time per iteration (s): 2.23 | learning rate: 1.972E-04 | global batch size: 512 | lm loss: 2.574154E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.251 | TFLOPs: 23.60 | 63: iteration 2180/ 24424 | consumed samples: 1116160 | consumed tokens: 2285895680 | elapsed time per iteration (s): 2.23 | learning rate: 1.972E-04 | global batch size: 512 | lm loss: 2.569882E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.520 | TFLOPs: 23.63 | 63: iteration 2190/ 24424 | consumed samples: 1121280 | consumed tokens: 2296381440 | elapsed time per iteration (s): 2.24 | learning rate: 1.971E-04 | global batch size: 512 | lm loss: 2.582661E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.024 | TFLOPs: 23.58 | 63: iteration 2200/ 24424 | consumed samples: 1126400 | consumed tokens: 2306867200 | elapsed time per iteration (s): 2.27 | learning rate: 1.971E-04 | global batch size: 512 | lm loss: 2.584447E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.725 | TFLOPs: 23.24 | 63: iteration 2210/ 24424 | consumed samples: 1131520 | consumed tokens: 2317352960 | elapsed time per iteration (s): 2.28 | learning rate: 1.971E-04 | global batch size: 512 | lm loss: 2.553738E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.902 | TFLOPs: 23.15 | 63: iteration 2220/ 24424 | consumed samples: 1136640 | consumed tokens: 2327838720 | elapsed time per iteration (s): 2.25 | learning rate: 1.971E-04 | global batch size: 512 | lm loss: 2.552167E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.129 | TFLOPs: 23.38 | 63: iteration 2230/ 24424 | consumed samples: 1141760 | consumed tokens: 2338324480 | elapsed time per iteration (s): 2.25 | learning rate: 1.970E-04 | global batch size: 512 | lm loss: 2.542210E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.968 | TFLOPs: 23.47 | 63: iteration 2240/ 24424 | consumed samples: 1146880 | consumed tokens: 2348810240 | elapsed time per iteration (s): 2.27 | learning rate: 1.970E-04 | global batch size: 512 | lm loss: 2.559502E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.958 | TFLOPs: 23.26 | 63: iteration 2250/ 24424 | consumed samples: 1152000 | consumed tokens: 2359296000 | elapsed time per iteration (s): 2.27 | learning rate: 1.970E-04 | global batch size: 512 | lm loss: 2.537384E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.097 | TFLOPs: 23.17 | 63: iteration 2260/ 24424 | consumed samples: 1157120 | consumed tokens: 2369781760 | elapsed time per iteration (s): 2.26 | learning rate: 1.969E-04 | global batch size: 512 | lm loss: 2.546655E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.531 | TFLOPs: 23.32 | 63: iteration 2270/ 24424 | consumed samples: 1162240 | consumed tokens: 2380267520 | elapsed time per iteration (s): 2.25 | learning rate: 1.969E-04 | global batch size: 512 | lm loss: 2.571982E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.688 | TFLOPs: 23.44 | 63: iteration 2280/ 24424 | consumed samples: 1167360 | consumed tokens: 2390753280 | elapsed time per iteration (s): 2.23 | learning rate: 1.969E-04 | global batch size: 512 | lm loss: 2.541688E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.733 | TFLOPs: 23.65 | 63: iteration 2290/ 24424 | consumed samples: 1172480 | consumed tokens: 2401239040 | elapsed time per iteration (s): 2.23 | learning rate: 1.968E-04 | global batch size: 512 | lm loss: 2.546316E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.482 | TFLOPs: 23.62 | 63: iteration 2300/ 24424 | consumed samples: 1177600 | consumed tokens: 2411724800 | elapsed time per iteration (s): 2.25 | learning rate: 1.968E-04 | global batch size: 512 | lm loss: 2.541488E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.749 | TFLOPs: 23.45 | 63: iteration 2310/ 24424 | consumed samples: 1182720 | consumed tokens: 2422210560 | elapsed time per iteration (s): 2.23 | learning rate: 1.968E-04 | global batch size: 512 | lm loss: 2.545014E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.752 | TFLOPs: 23.65 | 63: iteration 2320/ 24424 | consumed samples: 1187840 | consumed tokens: 2432696320 | elapsed time per iteration (s): 2.24 | learning rate: 1.967E-04 | global batch size: 512 | lm loss: 2.569561E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.055 | TFLOPs: 23.58 | 63: iteration 2330/ 24424 | consumed samples: 1192960 | consumed tokens: 2443182080 | elapsed time per iteration (s): 2.27 | learning rate: 1.967E-04 | global batch size: 512 | lm loss: 2.536801E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.323 | TFLOPs: 23.20 | 63: iteration 2340/ 24424 | consumed samples: 1198080 | consumed tokens: 2453667840 | elapsed time per iteration (s): 2.29 | learning rate: 1.967E-04 | global batch size: 512 | lm loss: 2.526478E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.729 | TFLOPs: 23.03 | 63: iteration 2350/ 24424 | consumed samples: 1203200 | consumed tokens: 2464153600 | elapsed time per iteration (s): 2.24 | learning rate: 1.967E-04 | global batch size: 512 | lm loss: 2.547458E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.469 | TFLOPs: 23.52 | 63: iteration 2360/ 24424 | consumed samples: 1208320 | consumed tokens: 2474639360 | elapsed time per iteration (s): 2.25 | learning rate: 1.966E-04 | global batch size: 512 | lm loss: 2.525115E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.426 | TFLOPs: 23.41 | 63: iteration 2370/ 24424 | consumed samples: 1213440 | consumed tokens: 2485125120 | elapsed time per iteration (s): 2.23 | learning rate: 1.966E-04 | global batch size: 512 | lm loss: 2.519418E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.520 | TFLOPs: 23.63 | 63: iteration 2380/ 24424 | consumed samples: 1218560 | consumed tokens: 2495610880 | elapsed time per iteration (s): 2.23 | learning rate: 1.966E-04 | global batch size: 512 | lm loss: 2.526603E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.603 | TFLOPs: 23.64 | 63: iteration 2390/ 24424 | consumed samples: 1223680 | consumed tokens: 2506096640 | elapsed time per iteration (s): 2.26 | learning rate: 1.965E-04 | global batch size: 512 | lm loss: 2.534482E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.313 | TFLOPs: 23.30 | 63: iteration 2400/ 24424 | consumed samples: 1228800 | consumed tokens: 2516582400 | elapsed time per iteration (s): 2.25 | learning rate: 1.965E-04 | global batch size: 512 | lm loss: 2.541116E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.056 | TFLOPs: 23.37 | 63: iteration 2410/ 24424 | consumed samples: 1233920 | consumed tokens: 2527068160 | elapsed time per iteration (s): 2.23 | learning rate: 1.965E-04 | global batch size: 512 | lm loss: 2.526513E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.178 | TFLOPs: 23.59 | 63: iteration 2420/ 24424 | consumed samples: 1239040 | consumed tokens: 2537553920 | elapsed time per iteration (s): 2.25 | learning rate: 1.964E-04 | global batch size: 512 | lm loss: 2.494288E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.475 | TFLOPs: 23.42 | 63: iteration 2430/ 24424 | consumed samples: 1244160 | consumed tokens: 2548039680 | elapsed time per iteration (s): 2.25 | learning rate: 1.964E-04 | global batch size: 512 | lm loss: 2.533417E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.595 | TFLOPs: 23.43 | 63: iteration 2440/ 24424 | consumed samples: 1249280 | consumed tokens: 2558525440 | elapsed time per iteration (s): 2.24 | learning rate: 1.964E-04 | global batch size: 512 | lm loss: 2.531666E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.855 | TFLOPs: 23.56 | 63: iteration 2450/ 24424 | consumed samples: 1254400 | consumed tokens: 2569011200 | elapsed time per iteration (s): 2.26 | learning rate: 1.963E-04 | global batch size: 512 | lm loss: 2.936887E+00 | grad norm: 1.818 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.092 | TFLOPs: 23.28 | 63: iteration 2460/ 24424 | consumed samples: 1259520 | consumed tokens: 2579496960 | elapsed time per iteration (s): 2.31 | learning rate: 1.963E-04 | global batch size: 512 | lm loss: 3.297261E+00 | grad norm: 2.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.080 | TFLOPs: 22.86 | 63: iteration 2470/ 24424 | consumed samples: 1264640 | consumed tokens: 2589982720 | elapsed time per iteration (s): 2.26 | learning rate: 1.963E-04 | global batch size: 512 | lm loss: 3.392698E+00 | grad norm: 1.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.674 | TFLOPs: 23.34 | 63: iteration 2480/ 24424 | consumed samples: 1269760 | consumed tokens: 2600468480 | elapsed time per iteration (s): 2.27 | learning rate: 1.962E-04 | global batch size: 512 | lm loss: 3.239055E+00 | grad norm: 0.889 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.877 | TFLOPs: 23.25 | 63: iteration 2490/ 24424 | consumed samples: 1274880 | consumed tokens: 2610954240 | elapsed time per iteration (s): 2.29 | learning rate: 1.962E-04 | global batch size: 512 | lm loss: 3.050708E+00 | grad norm: 0.921 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.885 | TFLOPs: 23.05 | 63: iteration 2500/ 24424 | consumed samples: 1280000 | consumed tokens: 2621440000 | elapsed time per iteration (s): 2.24 | learning rate: 1.962E-04 | global batch size: 512 | lm loss: 2.868542E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.127 | TFLOPs: 23.48 | 63: iteration 2510/ 24424 | consumed samples: 1285120 | consumed tokens: 2631925760 | elapsed time per iteration (s): 2.28 | learning rate: 1.961E-04 | global batch size: 512 | lm loss: 2.787912E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.137 | TFLOPs: 23.07 | 63: iteration 2520/ 24424 | consumed samples: 1290240 | consumed tokens: 2642411520 | elapsed time per iteration (s): 2.25 | learning rate: 1.961E-04 | global batch size: 512 | lm loss: 2.701893E+00 | grad norm: 0.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.584 | TFLOPs: 23.43 | 63: iteration 2530/ 24424 | consumed samples: 1295360 | consumed tokens: 2652897280 | elapsed time per iteration (s): 2.25 | learning rate: 1.961E-04 | global batch size: 512 | lm loss: 2.730347E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.818 | TFLOPs: 23.45 | 63: iteration 2540/ 24424 | consumed samples: 1300480 | consumed tokens: 2663383040 | elapsed time per iteration (s): 2.25 | learning rate: 1.960E-04 | global batch size: 512 | lm loss: 2.701884E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.954 | TFLOPs: 23.47 | 63: iteration 2550/ 24424 | consumed samples: 1305600 | consumed tokens: 2673868800 | elapsed time per iteration (s): 2.26 | learning rate: 1.960E-04 | global batch size: 512 | lm loss: 2.674618E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.335 | TFLOPs: 23.30 | 63: iteration 2560/ 24424 | consumed samples: 1310720 | consumed tokens: 2684354560 | elapsed time per iteration (s): 2.24 | learning rate: 1.960E-04 | global batch size: 512 | lm loss: 2.638983E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.892 | TFLOPs: 23.56 | 63: iteration 2570/ 24424 | consumed samples: 1315840 | consumed tokens: 2694840320 | elapsed time per iteration (s): 2.23 | learning rate: 1.959E-04 | global batch size: 512 | lm loss: 2.613602E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.109 | TFLOPs: 23.59 | 63: iteration 2580/ 24424 | consumed samples: 1320960 | consumed tokens: 2705326080 | elapsed time per iteration (s): 2.23 | learning rate: 1.959E-04 | global batch size: 512 | lm loss: 2.599682E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.424 | TFLOPs: 23.62 | 63: iteration 2590/ 24424 | consumed samples: 1326080 | consumed tokens: 2715811840 | elapsed time per iteration (s): 2.26 | learning rate: 1.959E-04 | global batch size: 512 | lm loss: 2.571127E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.079 | TFLOPs: 23.27 | 63: iteration 2600/ 24424 | consumed samples: 1331200 | consumed tokens: 2726297600 | elapsed time per iteration (s): 2.27 | learning rate: 1.958E-04 | global batch size: 512 | lm loss: 2.549672E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.990 | TFLOPs: 23.26 | 63: iteration 2610/ 24424 | consumed samples: 1336320 | consumed tokens: 2736783360 | elapsed time per iteration (s): 2.26 | learning rate: 1.958E-04 | global batch size: 512 | lm loss: 2.533034E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.245 | TFLOPs: 23.29 | 63: iteration 2620/ 24424 | consumed samples: 1341440 | consumed tokens: 2747269120 | elapsed time per iteration (s): 2.25 | learning rate: 1.957E-04 | global batch size: 512 | lm loss: 2.549764E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.964 | TFLOPs: 23.47 | 63: iteration 2630/ 24424 | consumed samples: 1346560 | consumed tokens: 2757754880 | elapsed time per iteration (s): 2.25 | learning rate: 1.957E-04 | global batch size: 512 | lm loss: 2.554407E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.461 | TFLOPs: 23.42 | 63: iteration 2640/ 24424 | consumed samples: 1351680 | consumed tokens: 2768240640 | elapsed time per iteration (s): 2.28 | learning rate: 1.957E-04 | global batch size: 512 | lm loss: 2.524119E+00 | grad norm: 0.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.857 | TFLOPs: 23.15 | 63: iteration 2650/ 24424 | consumed samples: 1356800 | consumed tokens: 2778726400 | elapsed time per iteration (s): 2.24 | learning rate: 1.956E-04 | global batch size: 512 | lm loss: 2.513865E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.380 | TFLOPs: 23.51 | 63: iteration 2660/ 24424 | consumed samples: 1361920 | consumed tokens: 2789212160 | elapsed time per iteration (s): 2.27 | learning rate: 1.956E-04 | global batch size: 512 | lm loss: 2.535285E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.128 | TFLOPs: 23.18 | 63: iteration 2670/ 24424 | consumed samples: 1367040 | consumed tokens: 2799697920 | elapsed time per iteration (s): 2.24 | learning rate: 1.956E-04 | global batch size: 512 | lm loss: 2.536213E+00 | grad norm: 0.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.631 | TFLOPs: 23.54 | 63: iteration 2680/ 24424 | consumed samples: 1372160 | consumed tokens: 2810183680 | elapsed time per iteration (s): 2.24 | learning rate: 1.955E-04 | global batch size: 512 | lm loss: 2.523011E+00 | grad norm: 0.169 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.238 | TFLOPs: 23.50 | 63: iteration 2690/ 24424 | consumed samples: 1377280 | consumed tokens: 2820669440 | elapsed time per iteration (s): 2.24 | learning rate: 1.955E-04 | global batch size: 512 | lm loss: 2.510397E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.213 | TFLOPs: 23.49 | 63: iteration 2700/ 24424 | consumed samples: 1382400 | consumed tokens: 2831155200 | elapsed time per iteration (s): 2.25 | learning rate: 1.955E-04 | global batch size: 512 | lm loss: 2.504667E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.317 | TFLOPs: 23.40 | 63: iteration 2710/ 24424 | consumed samples: 1387520 | consumed tokens: 2841640960 | elapsed time per iteration (s): 2.23 | learning rate: 1.954E-04 | global batch size: 512 | lm loss: 2.511738E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.478 | TFLOPs: 23.62 | 63: iteration 2720/ 24424 | consumed samples: 1392640 | consumed tokens: 2852126720 | elapsed time per iteration (s): 2.24 | learning rate: 1.954E-04 | global batch size: 512 | lm loss: 2.518962E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.570 | TFLOPs: 23.53 | 63: iteration 2730/ 24424 | consumed samples: 1397760 | consumed tokens: 2862612480 | elapsed time per iteration (s): 2.29 | learning rate: 1.953E-04 | global batch size: 512 | lm loss: 2.522426E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.632 | TFLOPs: 23.02 | 63: iteration 2740/ 24424 | consumed samples: 1402880 | consumed tokens: 2873098240 | elapsed time per iteration (s): 2.44 | learning rate: 1.953E-04 | global batch size: 512 | lm loss: 2.506214E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 209.911 | TFLOPs: 21.61 | 63: iteration 2750/ 24424 | consumed samples: 1408000 | consumed tokens: 2883584000 | elapsed time per iteration (s): 2.24 | learning rate: 1.953E-04 | global batch size: 512 | lm loss: 2.505681E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.985 | TFLOPs: 23.57 | 63: iteration 2760/ 24424 | consumed samples: 1413120 | consumed tokens: 2894069760 | elapsed time per iteration (s): 2.24 | learning rate: 1.952E-04 | global batch size: 512 | lm loss: 2.514589E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.902 | TFLOPs: 23.56 | 63: iteration 2770/ 24424 | consumed samples: 1418240 | consumed tokens: 2904555520 | elapsed time per iteration (s): 2.26 | learning rate: 1.952E-04 | global batch size: 512 | lm loss: 2.517260E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.205 | TFLOPs: 23.29 | 63: iteration 2780/ 24424 | consumed samples: 1423360 | consumed tokens: 2915041280 | elapsed time per iteration (s): 2.27 | learning rate: 1.952E-04 | global batch size: 512 | lm loss: 2.499677E+00 | grad norm: 0.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.974 | TFLOPs: 23.26 | 63: iteration 2790/ 24424 | consumed samples: 1428480 | consumed tokens: 2925527040 | elapsed time per iteration (s): 2.23 | learning rate: 1.951E-04 | global batch size: 512 | lm loss: 2.500266E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.334 | TFLOPs: 23.61 | 63: iteration 2800/ 24424 | consumed samples: 1433600 | consumed tokens: 2936012800 | elapsed time per iteration (s): 2.23 | learning rate: 1.951E-04 | global batch size: 512 | lm loss: 2.485058E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.344 | TFLOPs: 23.61 | 63: iteration 2810/ 24424 | consumed samples: 1438720 | consumed tokens: 2946498560 | elapsed time per iteration (s): 2.26 | learning rate: 1.950E-04 | global batch size: 512 | lm loss: 2.505387E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.716 | TFLOPs: 23.34 | 63: iteration 2820/ 24424 | consumed samples: 1443840 | consumed tokens: 2956984320 | elapsed time per iteration (s): 2.25 | learning rate: 1.950E-04 | global batch size: 512 | lm loss: 2.487042E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.768 | TFLOPs: 23.45 | 63: iteration 2830/ 24424 | consumed samples: 1448960 | consumed tokens: 2967470080 | elapsed time per iteration (s): 2.24 | learning rate: 1.950E-04 | global batch size: 512 | lm loss: 2.488708E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.269 | TFLOPs: 23.50 | 63: iteration 2840/ 24424 | consumed samples: 1454080 | consumed tokens: 2977955840 | elapsed time per iteration (s): 2.24 | learning rate: 1.949E-04 | global batch size: 512 | lm loss: 2.503806E+00 | grad norm: 0.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.716 | TFLOPs: 23.55 | 63: iteration 2850/ 24424 | consumed samples: 1459200 | consumed tokens: 2988441600 | elapsed time per iteration (s): 2.24 | learning rate: 1.949E-04 | global batch size: 512 | lm loss: 2.522513E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.982 | TFLOPs: 23.57 | 63: iteration 2860/ 24424 | consumed samples: 1464320 | consumed tokens: 2998927360 | elapsed time per iteration (s): 2.23 | learning rate: 1.949E-04 | global batch size: 512 | lm loss: 2.472391E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.338 | TFLOPs: 23.61 | 63: iteration 2870/ 24424 | consumed samples: 1469440 | consumed tokens: 3009413120 | elapsed time per iteration (s): 2.24 | learning rate: 1.948E-04 | global batch size: 512 | lm loss: 2.493634E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.699 | TFLOPs: 23.54 | 63: iteration 2880/ 24424 | consumed samples: 1474560 | consumed tokens: 3019898880 | elapsed time per iteration (s): 2.23 | learning rate: 1.948E-04 | global batch size: 512 | lm loss: 2.457017E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.629 | TFLOPs: 23.64 | 63: iteration 2890/ 24424 | consumed samples: 1479680 | consumed tokens: 3030384640 | elapsed time per iteration (s): 2.25 | learning rate: 1.947E-04 | global batch size: 512 | lm loss: 2.469222E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.232 | TFLOPs: 23.39 | 63: iteration 2900/ 24424 | consumed samples: 1484800 | consumed tokens: 3040870400 | elapsed time per iteration (s): 2.23 | learning rate: 1.947E-04 | global batch size: 512 | lm loss: 2.489079E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.501 | TFLOPs: 23.63 | 63: iteration 2910/ 24424 | consumed samples: 1489920 | consumed tokens: 3051356160 | elapsed time per iteration (s): 2.27 | learning rate: 1.947E-04 | global batch size: 512 | lm loss: 2.480329E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.690 | TFLOPs: 23.23 | 63: iteration 2920/ 24424 | consumed samples: 1495040 | consumed tokens: 3061841920 | elapsed time per iteration (s): 2.27 | learning rate: 1.946E-04 | global batch size: 512 | lm loss: 2.471290E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.422 | TFLOPs: 23.21 | 63: iteration 2930/ 24424 | consumed samples: 1500160 | consumed tokens: 3072327680 | elapsed time per iteration (s): 2.24 | learning rate: 1.946E-04 | global batch size: 512 | lm loss: 2.481621E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.580 | TFLOPs: 23.53 | 63: iteration 2940/ 24424 | consumed samples: 1505280 | consumed tokens: 3082813440 | elapsed time per iteration (s): 2.23 | learning rate: 1.945E-04 | global batch size: 512 | lm loss: 2.475592E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.399 | TFLOPs: 23.62 | 63: iteration 2950/ 24424 | consumed samples: 1510400 | consumed tokens: 3093299200 | elapsed time per iteration (s): 2.28 | learning rate: 1.945E-04 | global batch size: 512 | lm loss: 2.441512E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.715 | TFLOPs: 23.13 | 63: iteration 2960/ 24424 | consumed samples: 1515520 | consumed tokens: 3103784960 | elapsed time per iteration (s): 2.23 | learning rate: 1.945E-04 | global batch size: 512 | lm loss: 2.474324E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.627 | TFLOPs: 23.64 | 63: iteration 2970/ 24424 | consumed samples: 1520640 | consumed tokens: 3114270720 | elapsed time per iteration (s): 2.24 | learning rate: 1.944E-04 | global batch size: 512 | lm loss: 2.465449E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.642 | TFLOPs: 23.54 | 63: iteration 2980/ 24424 | consumed samples: 1525760 | consumed tokens: 3124756480 | elapsed time per iteration (s): 2.23 | learning rate: 1.944E-04 | global batch size: 512 | lm loss: 2.473308E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.351 | TFLOPs: 23.61 | 63: iteration 2990/ 24424 | consumed samples: 1530880 | consumed tokens: 3135242240 | elapsed time per iteration (s): 2.23 | learning rate: 1.943E-04 | global batch size: 512 | lm loss: 2.491000E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.555 | TFLOPs: 23.63 | 63: iteration 3000/ 24424 | consumed samples: 1536000 | consumed tokens: 3145728000 | elapsed time per iteration (s): 2.26 | learning rate: 1.943E-04 | global batch size: 512 | lm loss: 2.470905E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.609 | TFLOPs: 23.33 | 63: ------------------------------------------------------------------------------------------ 63: valid loss at iteration 3000 | lm loss value: 2.493420E+00 | lm loss PPL: 1.210259E+01 | 63: ------------------------------------------------------------------------------------------ 0: saving checkpoint at iteration 3000 to checkpoints_3b9 0: [2022-11-25 18:55:16,629] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3000 is begin to save! 0: [2022-11-25 18:55:16,676] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_01-model_00-model_states.pt... 32: [2022-11-25 18:55:16,694] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_21-model_00-model_states.pt... 0: [2022-11-25 18:55:17,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_01-model_00-model_states.pt. 0: [2022-11-25 18:55:17,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_03-model_00-model_states.pt... 32: [2022-11-25 18:55:17,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_21-model_00-model_states.pt. 32: [2022-11-25 18:55:17,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_22-model_00-model_states.pt... 0: [2022-11-25 18:55:17,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_03-model_00-model_states.pt. 0: [2022-11-25 18:55:17,230] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_04-model_00-model_states.pt... 32: [2022-11-25 18:55:17,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_22-model_00-model_states.pt. 32: [2022-11-25 18:55:17,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_23-model_00-model_states.pt... 0: [2022-11-25 18:55:17,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_04-model_00-model_states.pt. 0: [2022-11-25 18:55:17,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_05-model_00-model_states.pt... 32: [2022-11-25 18:55:17,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_23-model_00-model_states.pt. 32: [2022-11-25 18:55:17,532] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_24-model_00-model_states.pt... 0: [2022-11-25 18:55:17,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_05-model_00-model_states.pt. 0: [2022-11-25 18:55:17,677] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_06-model_00-model_states.pt... 32: [2022-11-25 18:55:17,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_24-model_00-model_states.pt. 32: [2022-11-25 18:55:17,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_25-model_00-model_states.pt... 0: [2022-11-25 18:55:17,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_06-model_00-model_states.pt. 0: [2022-11-25 18:55:17,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_07-model_00-model_states.pt... 32: [2022-11-25 18:55:17,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_25-model_00-model_states.pt. 32: [2022-11-25 18:55:17,980] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_26-model_00-model_states.pt... 0: [2022-11-25 18:55:18,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_07-model_00-model_states.pt. 0: [2022-11-25 18:55:18,112] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_08-model_00-model_states.pt... 32: [2022-11-25 18:55:18,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_26-model_00-model_states.pt. 32: [2022-11-25 18:55:18,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_27-model_00-model_states.pt... 0: [2022-11-25 18:55:18,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_08-model_00-model_states.pt. 0: [2022-11-25 18:55:18,332] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_09-model_00-model_states.pt... 32: [2022-11-25 18:55:18,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_27-model_00-model_states.pt. 32: [2022-11-25 18:55:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_28-model_00-model_states.pt... 0: [2022-11-25 18:55:18,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_09-model_00-model_states.pt. 0: [2022-11-25 18:55:18,553] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_10-model_00-model_states.pt... 32: [2022-11-25 18:55:18,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_28-model_00-model_states.pt. 32: [2022-11-25 18:55:18,643] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_29-model_00-model_states.pt... 0: [2022-11-25 18:55:18,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_10-model_00-model_states.pt. 0: [2022-11-25 18:55:18,771] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_11-model_00-model_states.pt... 32: [2022-11-25 18:55:18,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_29-model_00-model_states.pt. 32: [2022-11-25 18:55:18,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_30-model_00-model_states.pt... 0: [2022-11-25 18:55:18,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_11-model_00-model_states.pt. 0: [2022-11-25 18:55:18,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_12-model_00-model_states.pt... 32: [2022-11-25 18:55:19,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_30-model_00-model_states.pt. 32: [2022-11-25 18:55:19,118] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_31-model_00-model_states.pt... 0: [2022-11-25 18:55:19,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_12-model_00-model_states.pt. 0: [2022-11-25 18:55:19,203] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_13-model_00-model_states.pt... 32: [2022-11-25 18:55:19,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_31-model_00-model_states.pt. 32: [2022-11-25 18:55:19,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_32-model_00-model_states.pt... 0: [2022-11-25 18:55:19,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_13-model_00-model_states.pt. 0: [2022-11-25 18:55:19,412] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_14-model_00-model_states.pt... 32: [2022-11-25 18:55:19,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_32-model_00-model_states.pt. 32: [2022-11-25 18:55:19,560] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_33-model_00-model_states.pt... 0: [2022-11-25 18:55:19,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_14-model_00-model_states.pt. 0: [2022-11-25 18:55:19,632] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_15-model_00-model_states.pt... 32: [2022-11-25 18:55:19,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_33-model_00-model_states.pt. 32: [2022-11-25 18:55:19,791] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_34-model_00-model_states.pt... 0: [2022-11-25 18:55:19,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_15-model_00-model_states.pt. 0: [2022-11-25 18:55:19,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_16-model_00-model_states.pt... 32: [2022-11-25 18:55:20,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_34-model_00-model_states.pt. 32: [2022-11-25 18:55:20,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_35-model_00-model_states.pt... 0: [2022-11-25 18:55:20,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_16-model_00-model_states.pt. 0: [2022-11-25 18:55:20,062] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_17-model_00-model_states.pt... 32: [2022-11-25 18:55:20,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_35-model_00-model_states.pt. 32: [2022-11-25 18:55:20,230] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_36-model_00-model_states.pt... 0: [2022-11-25 18:55:20,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_17-model_00-model_states.pt. 0: [2022-11-25 18:55:20,277] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_18-model_00-model_states.pt... 32: [2022-11-25 18:55:20,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_36-model_00-model_states.pt. 32: [2022-11-25 18:55:20,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_37-model_00-model_states.pt... 0: [2022-11-25 18:55:20,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_18-model_00-model_states.pt. 0: [2022-11-25 18:55:20,490] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_19-model_00-model_states.pt... 32: [2022-11-25 18:55:20,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_37-model_00-model_states.pt. 32: [2022-11-25 18:55:20,668] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_38-model_00-model_states.pt... 0: [2022-11-25 18:55:20,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_19-model_00-model_states.pt. 0: [2022-11-25 18:55:20,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_20-model_00-model_states.pt... 32: [2022-11-25 18:55:20,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_38-model_00-model_states.pt. 32: [2022-11-25 18:55:20,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/layer_40-model_00-model_states.pt... 32: [2022-11-25 18:55:20,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_40-model_00-model_states.pt. 32: [2022-11-25 18:55:20,892] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/mp_rank_01_model_states.pt... 32: [2022-11-25 18:55:20,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/mp_rank_01_model_states.pt. 0: [2022-11-25 18:55:20,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/layer_20-model_00-model_states.pt. 0: [2022-11-25 18:55:20,916] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step3000/mp_rank_00_model_states.pt 0: [2022-11-25 18:55:20,916] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/mp_rank_00_model_states.pt... 0: [2022-11-25 18:55:20,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/mp_rank_00_model_states.pt. 51: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 55: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 59: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 59: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 59: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 63: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 63: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 19: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 52: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 52: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 54: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 54: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 54: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 58: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 58: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 62: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 62: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 16: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 60: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 60: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 60: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 2: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 2: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 2: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 2: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 2: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 2: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 0: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 0: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 0: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 33: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 33: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 33: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 33: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 1: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 1: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 1: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 1: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 1: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 1: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 35: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 22: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 34: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 34: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 40: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 44: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 44: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 44: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 6: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 6: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 6: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 6: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 6: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 6: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 32: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 42: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 42: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 42: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 46: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 46: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 36: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 41: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 41: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 41: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 45: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 45: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 45: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 38: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 38: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 38: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 51: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 55: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 59: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 59: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 63: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 63: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 19: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 19: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 19: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 19: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 19: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 19: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 52: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 54: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 58: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 58: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 62: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 62: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 16: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 60: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 2: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 0: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 0: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 0: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 33: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 1: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 1: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 35: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 35: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 22: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 34: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 40: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 40: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 44: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 6: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 6: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 32: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 42: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 42: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 46: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 46: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 36: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 36: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 41: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 45: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 38: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 55: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 63: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 19: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 16: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 16: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 16: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 16: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 2: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 0: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 0: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 33: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 7: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 40: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 44: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 32: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 32: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 46: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 36: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 45: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 55: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 63: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 21: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 16: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 40: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 44: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 32: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 32: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 46: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 45: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 55: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 16: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 40: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 44: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 44: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 32: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 55: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 18: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 40: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 32: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 55: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 18: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 40: [2022-11-25 18:55:21,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step3000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 53: [2022-11-25 18:55:21,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-25 18:55:21,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-25 18:55:21,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 20: [2022-11-25 18:55:21,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:55:21,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-25 18:55:21,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 23: [2022-11-25 18:55:21,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:55:21,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-25 18:55:21,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 53: [2022-11-25 18:55:21,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-25 18:55:21,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-25 18:55:21,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 18:55:21,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 0: [2022-11-25 18:55:21,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 0: [2022-11-25 18:55:21,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 20: [2022-11-25 18:55:21,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:55:21,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-25 18:55:21,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 23: [2022-11-25 18:55:21,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:55:21,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-25 18:55:21,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 18:55:21,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-25 18:55:21,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-25 18:55:21,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 31: [2022-11-25 18:55:21,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:55:21,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-25 18:55:21,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 33: [2022-11-25 18:55:21,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-25 18:55:21,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-25 18:55:21,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 58: [2022-11-25 18:55:21,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 58: [2022-11-25 18:55:21,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-25 18:55:21,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 47: [2022-11-25 18:55:21,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:55:21,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-25 18:55:21,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 18:55:21,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-25 18:55:21,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 0: [2022-11-25 18:55:21,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 32: [2022-11-25 18:55:21,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-25 18:55:21,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-25 18:55:21,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 32: [2022-11-25 18:55:21,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-25 18:55:21,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-25 18:55:21,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 32: [2022-11-25 18:55:21,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-25 18:55:21,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-25 18:55:21,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 18:55:21,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:55:21,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:55:21,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-25 18:55:21,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 55: [2022-11-25 18:55:21,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:55:21,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 33: [2022-11-25 18:55:21,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:55:21,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 33: [2022-11-25 18:55:21,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-25 18:55:21,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 23: [2022-11-25 18:55:21,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:55:21,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-25 18:55:21,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 60: [2022-11-25 18:55:21,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-25 18:55:21,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 55: [2022-11-25 18:55:21,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 60: [2022-11-25 18:55:21,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 55: [2022-11-25 18:55:21,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-25 18:55:21,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 47: [2022-11-25 18:55:21,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:55:21,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-25 18:55:21,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 18:55:21,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 0: [2022-11-25 18:55:21,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-25 18:55:21,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 18:55:21,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 34: [2022-11-25 18:55:21,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-25 18:55:21,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 51: [2022-11-25 18:55:21,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-25 18:55:21,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-25 18:55:21,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 51: [2022-11-25 18:55:21,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-25 18:55:21,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-25 18:55:21,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 38: [2022-11-25 18:55:21,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-25 18:55:21,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-25 18:55:21,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 18:55:21,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-25 18:55:21,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-25 18:55:21,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 18:55:21,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 34: [2022-11-25 18:55:21,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-25 18:55:21,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 58: [2022-11-25 18:55:21,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-25 18:55:21,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-25 18:55:21,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 31: [2022-11-25 18:55:21,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:55:21,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-25 18:55:21,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 31: [2022-11-25 18:55:21,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 32: [2022-11-25 18:55:21,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-25 18:55:21,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-25 18:55:21,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 32: [2022-11-25 18:55:21,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 32: [2022-11-25 18:55:21,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-25 18:55:21,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 20: [2022-11-25 18:55:21,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:55:21,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-25 18:55:21,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 38: [2022-11-25 18:55:21,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-25 18:55:21,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-25 18:55:21,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 4: [2022-11-25 18:55:21,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 4: [2022-11-25 18:55:21,240] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-25 18:55:21,240] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 31: [2022-11-25 18:55:21,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-25 18:55:21,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 18:55:21,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-25 18:55:21,240] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-25 18:55:21,240] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 43: [2022-11-25 18:55:21,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-25 18:55:21,240] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 43: [2022-11-25 18:55:21,240] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 18:55:21,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-25 18:55:21,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 17: [2022-11-25 18:55:21,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 39: [2022-11-25 18:55:21,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 17: [2022-11-25 18:55:21,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 17: [2022-11-25 18:55:21,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 17: [2022-11-25 18:55:21,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:55:21,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-25 18:55:21,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 58: [2022-11-25 18:55:21,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-25 18:55:21,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-25 18:55:21,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 18:55:21,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-25 18:55:21,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-25 18:55:21,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 46: [2022-11-25 18:55:21,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-25 18:55:21,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-25 18:55:21,243] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-25 18:55:21,243] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-25 18:55:21,243] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 46: [2022-11-25 18:55:21,243] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 43: [2022-11-25 18:55:21,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-25 18:55:21,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-25 18:55:21,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 38: [2022-11-25 18:55:21,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-25 18:55:21,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-25 18:55:21,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 55: [2022-11-25 18:55:21,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:55:21,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-25 18:55:21,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 18:55:21,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 59: [2022-11-25 18:55:21,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-25 18:55:21,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 55: [2022-11-25 18:55:21,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:55:21,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-25 18:55:21,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 18:55:21,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:55:21,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 4: [2022-11-25 18:55:21,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:55:21,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 4: [2022-11-25 18:55:21,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-25 18:55:21,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 60: [2022-11-25 18:55:21,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-25 18:55:21,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-25 18:55:21,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 18:55:21,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 51: [2022-11-25 18:55:21,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-25 18:55:21,249] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-25 18:55:21,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 18:55:21,249] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-25 18:55:21,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 47: [2022-11-25 18:55:21,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:55:21,250] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 47: [2022-11-25 18:55:21,250] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 1: [2022-11-25 18:55:21,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:55:21,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 25: [2022-11-25 18:55:21,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 25: [2022-11-25 18:55:21,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:55:21,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 18:55:21,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 25: [2022-11-25 18:55:21,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-25 18:55:21,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 18:55:21,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 18:55:21,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 47: [2022-11-25 18:55:21,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:55:21,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-25 18:55:21,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 18:55:21,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-25 18:55:21,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 1: [2022-11-25 18:55:21,253] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:55:21,253] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-25 18:55:21,253] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 60: [2022-11-25 18:55:21,253] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 60: [2022-11-25 18:55:21,253] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-25 18:55:21,253] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 45: [2022-11-25 18:55:21,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 45: [2022-11-25 18:55:21,254] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 45: [2022-11-25 18:55:21,254] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 18:55:21,254] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-25 18:55:21,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 52: [2022-11-25 18:55:21,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-25 18:55:21,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-25 18:55:21,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-25 18:55:21,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-25 18:55:21,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 52: [2022-11-25 18:55:21,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-25 18:55:21,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-25 18:55:21,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 52: [2022-11-25 18:55:21,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 18:55:21,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:55:21,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-25 18:55:21,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 24: [2022-11-25 18:55:21,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:55:21,256] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-25 18:55:21,257] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 1: [2022-11-25 18:55:21,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:55:21,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 1: [2022-11-25 18:55:21,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 18:55:21,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-25 18:55:21,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-25 18:55:21,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-25 18:55:21,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-25 18:55:21,259] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 18:55:21,259] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 18:55:21,260] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-25 18:55:21,260] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-25 18:55:21,260] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 57: [2022-11-25 18:55:21,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 57: [2022-11-25 18:55:21,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 17: [2022-11-25 18:55:21,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 36: [2022-11-25 18:55:21,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-25 18:55:21,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 57: [2022-11-25 18:55:21,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 17: [2022-11-25 18:55:21,261] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 36: [2022-11-25 18:55:21,261] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 36: [2022-11-25 18:55:21,261] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 57: [2022-11-25 18:55:21,261] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 17: [2022-11-25 18:55:21,261] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 57: [2022-11-25 18:55:21,261] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-25 18:55:21,261] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 36: [2022-11-25 18:55:21,261] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 36: [2022-11-25 18:55:21,261] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 57: [2022-11-25 18:55:21,261] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 57: [2022-11-25 18:55:21,261] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 57: [2022-11-25 18:55:21,261] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 60: [2022-11-25 18:55:21,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-25 18:55:21,261] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-25 18:55:21,261] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 18:55:21,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:55:21,262] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-25 18:55:21,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:55:21,262] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 18:55:21,262] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-25 18:55:21,262] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 31: [2022-11-25 18:55:21,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:55:21,262] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-25 18:55:21,262] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 24: [2022-11-25 18:55:21,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:55:21,263] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-25 18:55:21,263] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 47: [2022-11-25 18:55:21,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:55:21,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-25 18:55:21,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 36: [2022-11-25 18:55:21,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-25 18:55:21,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-25 18:55:21,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 55: [2022-11-25 18:55:21,265] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:55:21,265] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 53: [2022-11-25 18:55:21,265] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:55:21,265] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 53: [2022-11-25 18:55:21,265] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-25 18:55:21,265] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 46: [2022-11-25 18:55:21,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-25 18:55:21,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 46: [2022-11-25 18:55:21,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 54: [2022-11-25 18:55:21,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 54: [2022-11-25 18:55:21,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 54: [2022-11-25 18:55:21,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-25 18:55:21,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 54: [2022-11-25 18:55:21,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-25 18:55:21,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 54: [2022-11-25 18:55:21,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 18:55:21,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 54: [2022-11-25 18:55:21,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-25 18:55:21,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 18:55:21,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 56: [2022-11-25 18:55:21,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 40: [2022-11-25 18:55:21,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-25 18:55:21,269] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-25 18:55:21,269] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 58: [2022-11-25 18:55:21,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-25 18:55:21,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-25 18:55:21,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 4: [2022-11-25 18:55:21,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-25 18:55:21,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-25 18:55:21,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 18:55:21,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 35: [2022-11-25 18:55:21,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 35: [2022-11-25 18:55:21,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-25 18:55:21,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-25 18:55:21,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-25 18:55:21,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 18:55:21,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 18:55:21,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 6: [2022-11-25 18:55:21,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 43: [2022-11-25 18:55:21,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-25 18:55:21,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-25 18:55:21,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 23: [2022-11-25 18:55:21,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:55:21,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-25 18:55:21,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:55:21,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 23: [2022-11-25 18:55:21,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 39: [2022-11-25 18:55:21,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 23: [2022-11-25 18:55:21,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 18:55:21,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-25 18:55:21,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 51: [2022-11-25 18:55:21,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-25 18:55:21,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-25 18:55:21,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 53: [2022-11-25 18:55:21,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-25 18:55:21,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-25 18:55:21,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 38: [2022-11-25 18:55:21,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-25 18:55:21,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-25 18:55:21,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 18:55:21,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 25: [2022-11-25 18:55:21,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-25 18:55:21,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 24: [2022-11-25 18:55:21,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:55:21,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:55:21,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 17: [2022-11-25 18:55:21,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 12: [2022-11-25 18:55:21,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:55:21,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 17: [2022-11-25 18:55:21,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 12: [2022-11-25 18:55:21,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-25 18:55:21,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 18:55:21,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-25 18:55:21,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-25 18:55:21,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 18:55:21,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:55:21,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-25 18:55:21,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 18:55:21,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:55:21,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 30: [2022-11-25 18:55:21,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:55:21,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 18:55:21,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:55:21,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 13: [2022-11-25 18:55:21,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 30: [2022-11-25 18:55:21,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 18:55:21,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 45: [2022-11-25 18:55:21,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 45: [2022-11-25 18:55:21,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-25 18:55:21,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 18:55:21,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 45: [2022-11-25 18:55:21,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-25 18:55:21,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-25 18:55:21,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 18:55:21,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 42: [2022-11-25 18:55:21,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 34: [2022-11-25 18:55:21,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 42: [2022-11-25 18:55:21,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-25 18:55:21,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 10: [2022-11-25 18:55:21,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 10: [2022-11-25 18:55:21,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-25 18:55:21,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 33: [2022-11-25 18:55:21,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-25 18:55:21,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-25 18:55:21,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 28: [2022-11-25 18:55:21,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:55:21,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-25 18:55:21,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 60: [2022-11-25 18:55:21,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-25 18:55:21,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-25 18:55:21,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 29: [2022-11-25 18:55:21,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-25 18:55:21,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-25 18:55:21,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 18:55:21,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-25 18:55:21,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-25 18:55:21,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 18:55:21,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:55:21,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-25 18:55:21,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 16: [2022-11-25 18:55:21,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:55:21,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-25 18:55:21,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 18:55:21,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-25 18:55:21,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-25 18:55:21,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 22: [2022-11-25 18:55:21,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:55:21,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 22: [2022-11-25 18:55:21,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 18:55:21,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 56: [2022-11-25 18:55:21,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-25 18:55:21,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 44: [2022-11-25 18:55:21,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-25 18:55:21,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-25 18:55:21,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 17: [2022-11-25 18:55:21,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:55:21,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-25 18:55:21,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 18:55:21,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:55:21,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:55:21,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:55:21,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-25 18:55:21,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-25 18:55:21,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 8: [2022-11-25 18:55:21,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:55:21,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 18:55:21,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 18:55:21,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 8: [2022-11-25 18:55:21,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-25 18:55:21,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 49: [2022-11-25 18:55:21,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:55:21,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:55:21,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:55:21,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-25 18:55:21,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 49: [2022-11-25 18:55:21,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 49: [2022-11-25 18:55:21,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-25 18:55:21,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 49: [2022-11-25 18:55:21,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 18:55:21,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-25 18:55:21,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-25 18:55:21,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 18:55:21,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-25 18:55:21,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-25 18:55:21,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-25 18:55:21,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-25 18:55:21,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-25 18:55:21,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-25 18:55:21,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 18:55:21,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 18:55:21,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 63: [2022-11-25 18:55:21,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-25 18:55:21,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-25 18:55:21,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-25 18:55:21,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-25 18:55:21,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-25 18:55:21,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-25 18:55:21,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 44: [2022-11-25 18:55:21,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-25 18:55:21,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 63: [2022-11-25 18:55:21,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 63: [2022-11-25 18:55:21,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 44: [2022-11-25 18:55:21,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-25 18:55:21,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 44: [2022-11-25 18:55:21,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 44: [2022-11-25 18:55:21,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 18: [2022-11-25 18:55:21,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 12: [2022-11-25 18:55:21,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:55:21,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-25 18:55:21,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 12: [2022-11-25 18:55:21,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-25 18:55:21,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 18:55:21,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-25 18:55:21,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-25 18:55:21,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 22: [2022-11-25 18:55:21,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:55:21,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-25 18:55:21,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 40: [2022-11-25 18:55:21,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-25 18:55:21,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-25 18:55:21,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 27: [2022-11-25 18:55:21,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-25 18:55:21,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-25 18:55:21,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-25 18:55:21,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-25 18:55:21,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 27: [2022-11-25 18:55:21,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 33: [2022-11-25 18:55:21,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 32: [2022-11-25 18:55:21,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 33: [2022-11-25 18:55:21,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 32: [2022-11-25 18:55:21,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 33: [2022-11-25 18:55:21,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 32: [2022-11-25 18:55:21,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 42: [2022-11-25 18:55:21,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 42: [2022-11-25 18:55:21,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 60: [2022-11-25 18:55:21,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 42: [2022-11-25 18:55:21,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 60: [2022-11-25 18:55:21,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-25 18:55:21,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 40: [2022-11-25 18:55:21,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 40: [2022-11-25 18:55:21,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 40: [2022-11-25 18:55:21,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 18:55:21,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 37: [2022-11-25 18:55:21,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-25 18:55:21,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 18:55:21,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:55:21,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:55:21,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:55:21,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-25 18:55:21,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 3: [2022-11-25 18:55:21,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 5: [2022-11-25 18:55:21,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:55:21,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:55:21,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:55:21,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 18:55:21,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 18:55:21,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 5: [2022-11-25 18:55:21,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-25 18:55:21,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 5: [2022-11-25 18:55:21,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-25 18:55:21,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 5: [2022-11-25 18:55:21,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 5: [2022-11-25 18:55:21,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 2: [2022-11-25 18:55:21,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-25 18:55:21,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-25 18:55:21,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-25 18:55:21,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-25 18:55:21,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-25 18:55:21,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 2: [2022-11-25 18:55:21,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 2: [2022-11-25 18:55:21,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-25 18:55:21,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 10: [2022-11-25 18:55:21,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-25 18:55:21,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 10: [2022-11-25 18:55:21,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 42: [2022-11-25 18:55:21,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 42: [2022-11-25 18:55:21,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-25 18:55:21,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 18:55:21,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 9: [2022-11-25 18:55:21,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-25 18:55:21,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-25 18:55:21,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-25 18:55:21,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 18:55:21,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 18:55:21,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:55:21,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-25 18:55:21,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 26: [2022-11-25 18:55:21,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:55:21,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 26: [2022-11-25 18:55:21,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 53: [2022-11-25 18:55:21,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-25 18:55:21,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-25 18:55:21,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 8: [2022-11-25 18:55:21,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-25 18:55:21,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 8: [2022-11-25 18:55:21,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 16: [2022-11-25 18:55:21,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:55:21,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-25 18:55:21,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 18:55:21,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-25 18:55:21,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-25 18:55:21,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 20: [2022-11-25 18:55:21,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:55:21,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-25 18:55:21,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 8: [2022-11-25 18:55:21,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 8: [2022-11-25 18:55:21,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-25 18:55:21,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 41: [2022-11-25 18:55:21,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-25 18:55:21,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-25 18:55:21,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 22: [2022-11-25 18:55:21,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:55:21,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-25 18:55:21,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 61: [2022-11-25 18:55:21,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 61: [2022-11-25 18:55:21,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 61: [2022-11-25 18:55:21,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-25 18:55:21,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-25 18:55:21,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-25 18:55:21,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 61: [2022-11-25 18:55:21,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 61: [2022-11-25 18:55:21,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 61: [2022-11-25 18:55:21,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 18: [2022-11-25 18:55:21,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:55:21,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-25 18:55:21,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 29: [2022-11-25 18:55:21,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-25 18:55:21,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-25 18:55:21,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 12: [2022-11-25 18:55:21,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 12: [2022-11-25 18:55:21,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-25 18:55:21,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 33: [2022-11-25 18:55:21,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-25 18:55:21,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-25 18:55:21,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 27: [2022-11-25 18:55:21,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-25 18:55:21,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-25 18:55:21,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 41: [2022-11-25 18:55:21,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-25 18:55:21,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-25 18:55:21,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 30: [2022-11-25 18:55:21,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:55:21,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-25 18:55:21,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 17: [2022-11-25 18:55:21,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:55:21,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-25 18:55:21,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 30: [2022-11-25 18:55:21,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:55:21,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-25 18:55:21,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 41: [2022-11-25 18:55:21,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-25 18:55:21,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 41: [2022-11-25 18:55:21,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 28: [2022-11-25 18:55:21,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:55:21,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-25 18:55:21,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 28: [2022-11-25 18:55:21,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:55:21,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-25 18:55:21,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 14: [2022-11-25 18:55:21,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-25 18:55:21,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-25 18:55:21,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-25 18:55:21,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-25 18:55:21,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 14: [2022-11-25 18:55:21,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 18: [2022-11-25 18:55:21,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:55:21,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-25 18:55:21,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 10: [2022-11-25 18:55:21,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-25 18:55:21,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-25 18:55:21,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 16: [2022-11-25 18:55:21,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:55:21,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-25 18:55:21,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 33: [2022-11-25 18:55:21,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-25 18:55:21,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-25 18:55:21,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 26: [2022-11-25 18:55:21,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:55:21,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:55:21,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-25 18:55:21,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-25 18:55:21,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 26: [2022-11-25 18:55:21,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 57: [2022-11-25 18:55:21,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 57: [2022-11-25 18:55:21,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 57: [2022-11-25 18:55:21,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 36: [2022-11-25 18:55:21,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 36: [2022-11-25 18:55:21,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-25 18:55:21,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 20: [2022-11-25 18:55:21,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:55:21,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 20: [2022-11-25 18:55:21,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 11: [2022-11-25 18:55:21,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:55:21,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 44: [2022-11-25 18:55:21,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 11: [2022-11-25 18:55:21,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 44: [2022-11-25 18:55:21,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-25 18:55:21,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 18:55:21,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:55:21,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:55:21,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:55:21,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-25 18:55:21,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-25 18:55:21,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-25 18:55:21,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 18:55:21,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 18:55:21,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 18:55:21,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:55:21,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-25 18:55:21,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 14: [2022-11-25 18:55:21,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-25 18:55:21,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-25 18:55:21,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 18:55:21,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-25 18:55:21,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-25 18:55:21,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 47: [2022-11-25 18:55:21,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:55:21,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-25 18:55:21,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 18:55:21,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 59: [2022-11-25 18:55:21,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-25 18:55:21,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 58: [2022-11-25 18:55:21,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-25 18:55:21,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-25 18:55:21,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 18:55:21,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-25 18:55:21,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-25 18:55:21,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 45: [2022-11-25 18:55:21,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-25 18:55:21,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-25 18:55:21,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 18:55:21,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-25 18:55:21,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-25 18:55:21,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 55: [2022-11-25 18:55:21,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:55:21,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-25 18:55:21,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 63: [2022-11-25 18:55:21,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-25 18:55:21,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-25 18:55:21,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 40: [2022-11-25 18:55:21,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-25 18:55:21,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-25 18:55:21,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 31: [2022-11-25 18:55:21,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:55:21,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 38: [2022-11-25 18:55:21,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 31: [2022-11-25 18:55:21,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 38: [2022-11-25 18:55:21,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-25 18:55:21,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 51: [2022-11-25 18:55:21,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 13: [2022-11-25 18:55:21,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 51: [2022-11-25 18:55:21,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-25 18:55:21,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 18:55:21,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-25 18:55:21,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-25 18:55:21,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 18:55:21,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-25 18:55:21,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 20: [2022-11-25 18:55:21,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:55:21,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-25 18:55:21,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 18:55:21,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:55:21,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-25 18:55:21,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 49: [2022-11-25 18:55:21,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:55:21,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-25 18:55:21,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 1: [2022-11-25 18:55:21,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:55:21,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-25 18:55:21,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 18:55:21,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 9: [2022-11-25 18:55:21,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-25 18:55:21,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 18:55:21,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-25 18:55:21,390] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-25 18:55:21,390] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 52: [2022-11-25 18:55:21,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-25 18:55:21,390] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-25 18:55:21,390] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 46: [2022-11-25 18:55:21,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 41: [2022-11-25 18:55:21,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 23: [2022-11-25 18:55:21,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 46: [2022-11-25 18:55:21,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-25 18:55:21,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 41: [2022-11-25 18:55:21,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 23: [2022-11-25 18:55:21,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 24: [2022-11-25 18:55:21,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 41: [2022-11-25 18:55:21,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 23: [2022-11-25 18:55:21,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 18:55:21,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 24: [2022-11-25 18:55:21,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 37: [2022-11-25 18:55:21,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 50: [2022-11-25 18:55:21,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 24: [2022-11-25 18:55:21,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 18:55:21,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 18:55:21,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-25 18:55:21,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 53: [2022-11-25 18:55:21,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-25 18:55:21,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-25 18:55:21,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 27: [2022-11-25 18:55:21,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-25 18:55:21,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-25 18:55:21,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 2: [2022-11-25 18:55:21,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-25 18:55:21,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-25 18:55:21,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 18:55:21,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:55:21,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-25 18:55:21,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 18:55:21,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 12: [2022-11-25 18:55:21,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:55:21,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 12: [2022-11-25 18:55:21,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 19: [2022-11-25 18:55:21,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 12: [2022-11-25 18:55:21,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 4: [2022-11-25 18:55:21,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 4: [2022-11-25 18:55:21,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-25 18:55:21,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 42: [2022-11-25 18:55:21,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 42: [2022-11-25 18:55:21,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-25 18:55:21,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 18:55:21,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:55:21,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-25 18:55:21,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 22: [2022-11-25 18:55:21,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:55:21,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-25 18:55:21,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 16: [2022-11-25 18:55:21,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:55:21,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:55:21,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-25 18:55:21,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 18:55:21,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-25 18:55:21,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 15: [2022-11-25 18:55:21,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:55:21,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:55:21,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:55:21,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-25 18:55:21,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-25 18:55:21,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 15: [2022-11-25 18:55:21,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 15: [2022-11-25 18:55:21,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 15: [2022-11-25 18:55:21,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 43: [2022-11-25 18:55:21,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-25 18:55:21,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-25 18:55:21,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 18:55:21,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 39: [2022-11-25 18:55:21,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-25 18:55:21,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 54: [2022-11-25 18:55:21,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 54: [2022-11-25 18:55:21,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 10: [2022-11-25 18:55:21,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-25 18:55:21,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-25 18:55:21,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 28: [2022-11-25 18:55:21,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:55:21,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-25 18:55:21,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 30: [2022-11-25 18:55:21,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:55:21,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 25: [2022-11-25 18:55:21,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:55:21,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 32: [2022-11-25 18:55:21,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 25: [2022-11-25 18:55:21,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 32: [2022-11-25 18:55:21,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 25: [2022-11-25 18:55:21,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 32: [2022-11-25 18:55:21,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 18: [2022-11-25 18:55:21,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:55:21,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 60: [2022-11-25 18:55:21,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 18: [2022-11-25 18:55:21,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 26: [2022-11-25 18:55:21,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 60: [2022-11-25 18:55:21,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-25 18:55:21,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 26: [2022-11-25 18:55:21,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 35: [2022-11-25 18:55:21,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 26: [2022-11-25 18:55:21,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 18:55:21,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-25 18:55:21,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 5: [2022-11-25 18:55:21,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 33: [2022-11-25 18:55:21,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 5: [2022-11-25 18:55:21,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-25 18:55:21,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 33: [2022-11-25 18:55:21,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-25 18:55:21,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 14: [2022-11-25 18:55:21,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-25 18:55:21,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-25 18:55:21,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 54: [2022-11-25 18:55:21,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 11: [2022-11-25 18:55:21,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:55:21,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-25 18:55:21,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 18:55:21,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-25 18:55:21,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-25 18:55:21,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 61: [2022-11-25 18:55:21,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-25 18:55:21,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-25 18:55:21,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 57: [2022-11-25 18:55:21,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-25 18:55:21,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-25 18:55:21,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 15: [2022-11-25 18:55:21,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:55:21,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:55:21,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 21: [2022-11-25 18:55:21,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 15: [2022-11-25 18:55:21,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 18:55:21,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 36: [2022-11-25 18:55:21,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 44: [2022-11-25 18:55:21,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 56: [2022-11-25 18:55:21,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 44: [2022-11-25 18:55:21,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-25 18:55:21,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 18:55:21,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 56: [2022-11-25 18:55:21,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 17: [2022-11-25 18:55:21,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:55:21,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 0: [2022-11-25 18:55:21,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:55:21,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 18:55:21,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-25 18:55:21,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 40: [2022-11-25 18:55:21,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-25 18:55:21,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 11: [2022-11-25 18:55:21,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 40: [2022-11-25 18:55:21,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 11: [2022-11-25 18:55:21,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 36: [2022-11-25 18:55:21,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 11: [2022-11-25 18:55:21,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 36: [2022-11-25 18:55:21,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 58: [2022-11-25 18:55:21,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 58: [2022-11-25 18:55:21,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-25 18:55:21,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 18:55:21,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:55:21,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 59: [2022-11-25 18:55:21,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 47: [2022-11-25 18:55:21,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 59: [2022-11-25 18:55:21,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 47: [2022-11-25 18:55:21,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 63: [2022-11-25 18:55:21,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-25 18:55:21,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-25 18:55:21,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 20: [2022-11-25 18:55:21,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:55:21,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-25 18:55:21,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 18:55:21,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:55:21,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-25 18:55:21,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 55: [2022-11-25 18:55:21,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:55:21,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 55: [2022-11-25 18:55:21,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 11: [2022-11-25 18:55:21,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:55:21,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-25 18:55:21,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 31: [2022-11-25 18:55:21,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:55:21,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-25 18:55:21,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 41: [2022-11-25 18:55:21,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 41: [2022-11-25 18:55:21,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 41: [2022-11-25 18:55:21,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 18:55:21,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 50: [2022-11-25 18:55:21,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-25 18:55:21,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 45: [2022-11-25 18:55:21,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 45: [2022-11-25 18:55:21,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-25 18:55:21,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 8: [2022-11-25 18:55:21,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-25 18:55:21,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-25 18:55:21,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 23: [2022-11-25 18:55:21,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:55:21,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 23: [2022-11-25 18:55:21,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 24: [2022-11-25 18:55:21,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:55:21,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-25 18:55:21,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 52: [2022-11-25 18:55:21,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 52: [2022-11-25 18:55:21,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-25 18:55:21,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 1: [2022-11-25 18:55:21,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:55:21,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-25 18:55:21,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 18:55:21,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 34: [2022-11-25 18:55:21,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-25 18:55:21,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 53: [2022-11-25 18:55:21,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-25 18:55:21,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-25 18:55:21,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 12: [2022-11-25 18:55:21,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 12: [2022-11-25 18:55:21,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 51: [2022-11-25 18:55:21,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 12: [2022-11-25 18:55:21,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 51: [2022-11-25 18:55:21,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-25 18:55:21,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 27: [2022-11-25 18:55:21,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 27: [2022-11-25 18:55:21,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-25 18:55:21,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 26: [2022-11-25 18:55:21,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:55:21,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-25 18:55:21,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 18:55:21,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 2: [2022-11-25 18:55:21,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 2: [2022-11-25 18:55:21,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 3: [2022-11-25 18:55:21,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 2: [2022-11-25 18:55:21,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 18:55:21,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 38: [2022-11-25 18:55:21,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 9: [2022-11-25 18:55:21,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 38: [2022-11-25 18:55:21,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-25 18:55:21,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 18:55:21,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-25 18:55:21,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 42: [2022-11-25 18:55:21,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 42: [2022-11-25 18:55:21,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 4: [2022-11-25 18:55:21,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 42: [2022-11-25 18:55:21,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 4: [2022-11-25 18:55:21,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-25 18:55:21,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 30: [2022-11-25 18:55:21,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:55:21,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-25 18:55:21,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 18: [2022-11-25 18:55:21,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:55:21,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-25 18:55:21,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 61: [2022-11-25 18:55:21,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-25 18:55:21,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-25 18:55:21,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 11: [2022-11-25 18:55:21,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:55:21,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 16: [2022-11-25 18:55:21,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:55:21,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 16: [2022-11-25 18:55:21,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 16: [2022-11-25 18:55:21,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 18:55:21,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 60: [2022-11-25 18:55:21,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 37: [2022-11-25 18:55:21,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 6: [2022-11-25 18:55:21,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 37: [2022-11-25 18:55:21,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 60: [2022-11-25 18:55:21,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 60: [2022-11-25 18:55:21,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 18:55:21,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-25 18:55:21,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 18:55:21,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:55:21,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-25 18:55:21,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 43: [2022-11-25 18:55:21,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-25 18:55:21,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-25 18:55:21,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 14: [2022-11-25 18:55:21,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 38: [2022-11-25 18:55:21,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 14: [2022-11-25 18:55:21,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 54: [2022-11-25 18:55:21,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 14: [2022-11-25 18:55:21,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 38: [2022-11-25 18:55:21,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-25 18:55:21,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 33: [2022-11-25 18:55:21,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 44: [2022-11-25 18:55:21,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-25 18:55:21,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 33: [2022-11-25 18:55:21,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 44: [2022-11-25 18:55:21,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 18:55:21,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 33: [2022-11-25 18:55:21,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 18:55:21,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-25 18:55:21,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 10: [2022-11-25 18:55:21,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-25 18:55:21,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 32: [2022-11-25 18:55:21,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 10: [2022-11-25 18:55:21,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 32: [2022-11-25 18:55:21,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-25 18:55:21,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 28: [2022-11-25 18:55:21,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:55:21,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-25 18:55:21,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 18:55:21,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-25 18:55:21,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-25 18:55:21,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 18:55:21,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 57: [2022-11-25 18:55:21,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 39: [2022-11-25 18:55:21,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 57: [2022-11-25 18:55:21,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 57: [2022-11-25 18:55:21,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 18:55:21,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 18:55:21,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 25: [2022-11-25 18:55:21,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-25 18:55:21,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 54: [2022-11-25 18:55:21,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 49: [2022-11-25 18:55:21,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 54: [2022-11-25 18:55:21,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 49: [2022-11-25 18:55:21,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-25 18:55:21,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 58: [2022-11-25 18:55:21,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-25 18:55:21,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-25 18:55:21,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 18:55:21,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:55:21,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-25 18:55:21,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 36: [2022-11-25 18:55:21,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-25 18:55:21,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-25 18:55:21,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 18:55:21,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:55:21,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-25 18:55:21,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 18:55:21,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-25 18:55:21,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-25 18:55:21,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 8: [2022-11-25 18:55:21,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-25 18:55:21,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-25 18:55:21,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 40: [2022-11-25 18:55:21,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-25 18:55:21,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-25 18:55:21,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 46: [2022-11-25 18:55:21,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-25 18:55:21,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-25 18:55:21,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 47: [2022-11-25 18:55:21,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:55:21,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-25 18:55:21,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 18:55:21,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-25 18:55:21,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-25 18:55:21,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 54: [2022-11-25 18:55:21,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 54: [2022-11-25 18:55:21,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-25 18:55:21,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 55: [2022-11-25 18:55:21,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:55:21,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-25 18:55:21,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 63: [2022-11-25 18:55:21,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-25 18:55:21,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-25 18:55:21,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 5: [2022-11-25 18:55:21,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:55:21,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-25 18:55:21,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 22: [2022-11-25 18:55:21,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:55:21,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-25 18:55:21,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 18:55:21,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:55:21,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-25 18:55:21,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 23: [2022-11-25 18:55:21,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:55:21,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 8: [2022-11-25 18:55:21,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:55:21,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 8: [2022-11-25 18:55:21,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-25 18:55:21,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 18:55:21,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-25 18:55:21,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-25 18:55:21,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 61: [2022-11-25 18:55:21,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 61: [2022-11-25 18:55:21,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-25 18:55:21,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 20: [2022-11-25 18:55:21,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 51: [2022-11-25 18:55:21,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 20: [2022-11-25 18:55:21,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 51: [2022-11-25 18:55:21,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 20: [2022-11-25 18:55:21,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 51: [2022-11-25 18:55:21,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 1: [2022-11-25 18:55:21,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:55:21,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 1: [2022-11-25 18:55:21,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 18:55:21,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-25 18:55:21,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-25 18:55:21,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 52: [2022-11-25 18:55:21,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-25 18:55:21,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-25 18:55:21,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 18:55:21,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 62: [2022-11-25 18:55:21,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 34: [2022-11-25 18:55:21,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-25 18:55:21,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 18:55:21,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 24: [2022-11-25 18:55:21,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 62: [2022-11-25 18:55:21,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 24: [2022-11-25 18:55:21,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-25 18:55:21,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 18: [2022-11-25 18:55:21,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:55:21,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-25 18:55:21,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 12: [2022-11-25 18:55:21,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-25 18:55:21,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-25 18:55:21,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 46: [2022-11-25 18:55:21,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-25 18:55:21,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-25 18:55:21,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 31: [2022-11-25 18:55:21,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:55:21,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-25 18:55:21,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:55:21,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 31: [2022-11-25 18:55:21,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-25 18:55:21,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 17: [2022-11-25 18:55:21,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 9: [2022-11-25 18:55:21,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:55:21,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-25 18:55:21,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 45: [2022-11-25 18:55:21,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-25 18:55:21,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-25 18:55:21,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 18:55:21,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-25 18:55:21,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 18:55:21,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 4: [2022-11-25 18:55:21,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 4: [2022-11-25 18:55:21,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 3: [2022-11-25 18:55:21,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 4: [2022-11-25 18:55:21,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 18:55:21,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 26: [2022-11-25 18:55:21,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:55:21,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-25 18:55:21,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 51: [2022-11-25 18:55:21,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 51: [2022-11-25 18:55:21,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-25 18:55:21,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 27: [2022-11-25 18:55:21,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 27: [2022-11-25 18:55:21,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 35: [2022-11-25 18:55:21,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 27: [2022-11-25 18:55:21,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 18:55:21,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 42: [2022-11-25 18:55:21,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 35: [2022-11-25 18:55:21,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 42: [2022-11-25 18:55:21,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-25 18:55:21,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 15: [2022-11-25 18:55:21,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:55:21,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-25 18:55:21,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 2: [2022-11-25 18:55:21,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-25 18:55:21,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-25 18:55:21,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 16: [2022-11-25 18:55:21,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:55:21,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-25 18:55:21,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 41: [2022-11-25 18:55:21,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 41: [2022-11-25 18:55:21,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-25 18:55:21,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 18:55:21,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:55:21,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:55:21,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-25 18:55:21,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 18:55:21,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 7: [2022-11-25 18:55:21,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 53: [2022-11-25 18:55:21,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-25 18:55:21,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-25 18:55:21,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 43: [2022-11-25 18:55:21,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-25 18:55:21,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-25 18:55:21,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 14: [2022-11-25 18:55:21,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 14: [2022-11-25 18:55:21,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-25 18:55:21,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 18:55:21,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-25 18:55:21,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-25 18:55:21,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 18:55:21,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 25: [2022-11-25 18:55:21,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-25 18:55:21,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 28: [2022-11-25 18:55:21,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:55:21,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-25 18:55:21,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 58: [2022-11-25 18:55:21,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-25 18:55:21,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-25 18:55:21,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 18:55:21,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:55:21,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-25 18:55:21,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 18:55:21,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 13: [2022-11-25 18:55:21,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-25 18:55:21,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 49: [2022-11-25 18:55:21,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:55:21,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-25 18:55:21,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 57: [2022-11-25 18:55:21,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 57: [2022-11-25 18:55:21,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-25 18:55:21,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 18:55:21,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 54: [2022-11-25 18:55:21,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 56: [2022-11-25 18:55:21,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-25 18:55:21,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 54: [2022-11-25 18:55:21,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-25 18:55:21,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 45: [2022-11-25 18:55:21,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-25 18:55:21,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-25 18:55:21,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 29: [2022-11-25 18:55:21,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-25 18:55:21,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-25 18:55:21,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 18:55:21,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-25 18:55:21,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-25 18:55:21,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 18:55:21,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-25 18:55:21,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-25 18:55:21,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 36: [2022-11-25 18:55:21,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 36: [2022-11-25 18:55:21,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-25 18:55:21,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 5: [2022-11-25 18:55:21,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:55:21,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-25 18:55:21,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 61: [2022-11-25 18:55:21,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-25 18:55:21,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-25 18:55:21,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 11: [2022-11-25 18:55:21,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:55:21,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 11: [2022-11-25 18:55:21,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 30: [2022-11-25 18:55:21,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:55:21,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-25 18:55:21,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 10: [2022-11-25 18:55:21,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-25 18:55:21,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-25 18:55:21,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 18:55:21,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-25 18:55:21,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-25 18:55:21,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 38: [2022-11-25 18:55:21,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-25 18:55:21,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-25 18:55:21,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 1: [2022-11-25 18:55:21,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:55:21,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-25 18:55:21,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 24: [2022-11-25 18:55:21,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:55:21,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-25 18:55:21,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 44: [2022-11-25 18:55:21,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-25 18:55:21,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-25 18:55:21,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 18:55:21,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 37: [2022-11-25 18:55:21,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-25 18:55:21,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 18: [2022-11-25 18:55:21,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:55:21,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 40: [2022-11-25 18:55:21,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 18: [2022-11-25 18:55:21,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 40: [2022-11-25 18:55:21,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-25 18:55:21,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 12: [2022-11-25 18:55:21,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-25 18:55:21,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-25 18:55:21,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 15: [2022-11-25 18:55:21,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:55:21,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:55:21,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-25 18:55:21,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-25 18:55:21,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 15: [2022-11-25 18:55:21,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 8: [2022-11-25 18:55:21,547] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-25 18:55:21,547] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-25 18:55:21,547] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 18:55:21,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:55:21,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-25 18:55:21,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 18:55:21,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-25 18:55:21,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-25 18:55:21,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 18:55:21,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 34: [2022-11-25 18:55:21,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-25 18:55:21,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 4: [2022-11-25 18:55:21,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:55:21,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:55:21,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-25 18:55:21,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 18:55:21,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-25 18:55:21,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-25 18:55:21,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 16: [2022-11-25 18:55:21,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:55:21,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 4: [2022-11-25 18:55:21,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 16: [2022-11-25 18:55:21,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 4: [2022-11-25 18:55:21,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 18:55:21,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:55:21,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-25 18:55:21,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 30: [2022-11-25 18:55:21,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:55:21,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 27: [2022-11-25 18:55:21,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:55:21,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 27: [2022-11-25 18:55:21,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-25 18:55:21,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 63: [2022-11-25 18:55:21,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-25 18:55:21,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-25 18:55:21,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 18:55:21,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 52: [2022-11-25 18:55:21,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-25 18:55:21,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 39: [2022-11-25 18:55:21,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 52: [2022-11-25 18:55:21,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 18:55:21,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 18:55:21,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:55:21,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 50: [2022-11-25 18:55:21,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 2: [2022-11-25 18:55:21,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:55:21,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 18:55:21,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-25 18:55:21,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 2: [2022-11-25 18:55:21,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-25 18:55:21,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 41: [2022-11-25 18:55:21,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-25 18:55:21,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-25 18:55:21,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 18:55:21,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 49: [2022-11-25 18:55:21,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:55:21,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 9: [2022-11-25 18:55:21,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 49: [2022-11-25 18:55:21,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 18:55:21,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 61: [2022-11-25 18:55:21,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-25 18:55:21,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 56: [2022-11-25 18:55:21,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 61: [2022-11-25 18:55:21,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 18:55:21,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 56: [2022-11-25 18:55:21,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 46: [2022-11-25 18:55:21,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-25 18:55:21,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 4: [2022-11-25 18:55:21,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 46: [2022-11-25 18:55:21,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 4: [2022-11-25 18:55:21,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-25 18:55:21,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 12: [2022-11-25 18:55:21,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 10: [2022-11-25 18:55:21,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 12: [2022-11-25 18:55:21,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-25 18:55:21,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 10: [2022-11-25 18:55:21,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 10: [2022-11-25 18:55:21,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 16: [2022-11-25 18:55:21,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:55:21,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 57: [2022-11-25 18:55:21,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 16: [2022-11-25 18:55:21,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 40: [2022-11-25 18:55:21,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 57: [2022-11-25 18:55:21,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 57: [2022-11-25 18:55:21,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 40: [2022-11-25 18:55:21,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-25 18:55:21,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 63: [2022-11-25 18:55:21,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 19: [2022-11-25 18:55:21,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:55:21,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 63: [2022-11-25 18:55:21,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 19: [2022-11-25 18:55:21,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 63: [2022-11-25 18:55:21,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 18:55:21,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 18: [2022-11-25 18:55:21,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 7: [2022-11-25 18:55:21,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:55:21,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 18:55:21,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 43: [2022-11-25 18:55:21,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 52: [2022-11-25 18:55:21,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 7: [2022-11-25 18:55:21,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 52: [2022-11-25 18:55:21,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 43: [2022-11-25 18:55:21,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 43: [2022-11-25 18:55:21,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 52: [2022-11-25 18:55:21,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 42: [2022-11-25 18:55:21,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 42: [2022-11-25 18:55:21,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-25 18:55:21,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 14: [2022-11-25 18:55:21,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 14: [2022-11-25 18:55:21,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-25 18:55:21,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-25 18:55:21,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 14: [2022-11-25 18:55:21,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 37: [2022-11-25 18:55:21,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 14: [2022-11-25 18:55:21,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 18:55:21,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-25 18:55:21,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 41: [2022-11-25 18:55:21,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-25 18:55:21,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-25 18:55:21,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 22: [2022-11-25 18:55:21,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:55:21,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:55:21,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:55:21,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-25 18:55:21,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 1: [2022-11-25 18:55:21,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 22: [2022-11-25 18:55:21,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 22: [2022-11-25 18:55:21,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 1: [2022-11-25 18:55:21,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 44: [2022-11-25 18:55:21,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-25 18:55:21,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 30: [2022-11-25 18:55:21,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 44: [2022-11-25 18:55:21,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 11: [2022-11-25 18:55:21,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:55:21,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:55:21,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 11: [2022-11-25 18:55:21,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 11: [2022-11-25 18:55:21,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 18:55:21,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-25 18:55:21,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 30: [2022-11-25 18:55:21,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 42: [2022-11-25 18:55:21,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 29: [2022-11-25 18:55:21,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 42: [2022-11-25 18:55:21,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 42: [2022-11-25 18:55:21,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 29: [2022-11-25 18:55:21,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 50: [2022-11-25 18:55:21,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 29: [2022-11-25 18:55:21,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 18:55:21,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-25 18:55:21,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 18:55:21,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 2: [2022-11-25 18:55:21,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:55:21,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-25 18:55:21,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 2: [2022-11-25 18:55:21,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-25 18:55:21,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 24: [2022-11-25 18:55:21,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:55:21,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:55:21,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 5: [2022-11-25 18:55:21,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 28: [2022-11-25 18:55:21,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 25: [2022-11-25 18:55:21,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:55:21,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 18:55:21,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 24: [2022-11-25 18:55:21,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 18:55:21,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 28: [2022-11-25 18:55:21,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 25: [2022-11-25 18:55:21,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 35: [2022-11-25 18:55:21,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 28: [2022-11-25 18:55:21,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 18:55:21,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 18:55:21,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-25 18:55:21,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-25 18:55:21,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 18:55:21,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-25 18:55:21,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-25 18:55:21,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 27: [2022-11-25 18:55:21,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-25 18:55:21,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-25 18:55:21,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 18:55:21,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:55:21,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 3: [2022-11-25 18:55:21,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 5: [2022-11-25 18:55:21,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:55:21,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-25 18:55:21,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 8: [2022-11-25 18:55:21,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-25 18:55:21,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-25 18:55:21,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 11: [2022-11-25 18:55:21,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:55:21,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 11: [2022-11-25 18:55:21,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 18:55:21,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:55:21,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-25 18:55:21,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 18:55:21,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-25 18:55:21,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-25 18:55:21,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 22: [2022-11-25 18:55:21,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:55:21,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-25 18:55:21,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 26: [2022-11-25 18:55:21,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:55:21,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-25 18:55:21,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 43: [2022-11-25 18:55:21,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-25 18:55:21,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-25 18:55:21,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 18:55:21,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:55:21,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-25 18:55:21,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 18:55:21,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-25 18:55:21,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-25 18:55:21,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 28: [2022-11-25 18:55:21,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:55:21,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-25 18:55:21,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 29: [2022-11-25 18:55:21,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 46: [2022-11-25 18:55:21,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 29: [2022-11-25 18:55:21,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-25 18:55:21,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 46: [2022-11-25 18:55:21,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-25 18:55:21,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 18:55:21,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 35: [2022-11-25 18:55:21,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-25 18:55:21,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 10: [2022-11-25 18:55:21,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-25 18:55:21,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-25 18:55:21,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 45: [2022-11-25 18:55:21,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-25 18:55:21,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-25 18:55:21,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 18:55:21,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-25 18:55:21,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-25 18:55:21,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 36: [2022-11-25 18:55:21,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 15: [2022-11-25 18:55:21,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:55:21,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 36: [2022-11-25 18:55:21,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 15: [2022-11-25 18:55:21,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 36: [2022-11-25 18:55:21,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 49: [2022-11-25 18:55:21,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:55:21,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-25 18:55:21,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 29: [2022-11-25 18:55:21,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-25 18:55:21,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-25 18:55:21,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 29: [2022-11-25 18:55:21,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-25 18:55:21,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-25 18:55:21,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 29: [2022-11-25 18:55:21,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-25 18:55:21,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 29: [2022-11-25 18:55:21,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 54: [2022-11-25 18:55:21,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 54: [2022-11-25 18:55:21,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step3000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-25 18:55:21,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: successfully saved checkpoint at iteration 3000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5223.68 63: iteration 3010/ 24424 | consumed samples: 1541120 | consumed tokens: 3156213760 | elapsed time per iteration (s): 2.82 | learning rate: 1.943E-04 | global batch size: 512 | lm loss: 2.471980E+00 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.566 | TFLOPs: 18.69 | 63: iteration 3020/ 24424 | consumed samples: 1546240 | consumed tokens: 3166699520 | elapsed time per iteration (s): 2.25 | learning rate: 1.942E-04 | global batch size: 512 | lm loss: 2.470251E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.164 | TFLOPs: 23.39 | 63: iteration 3030/ 24424 | consumed samples: 1551360 | consumed tokens: 3177185280 | elapsed time per iteration (s): 2.24 | learning rate: 1.942E-04 | global batch size: 512 | lm loss: 2.443842E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.018 | TFLOPs: 23.58 | 63: iteration 3040/ 24424 | consumed samples: 1556480 | consumed tokens: 3187671040 | elapsed time per iteration (s): 2.27 | learning rate: 1.941E-04 | global batch size: 512 | lm loss: 2.480116E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.724 | TFLOPs: 23.24 | 63: iteration 3050/ 24424 | consumed samples: 1561600 | consumed tokens: 3198156800 | elapsed time per iteration (s): 2.24 | learning rate: 1.941E-04 | global batch size: 512 | lm loss: 2.464877E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.171 | TFLOPs: 23.49 | 63: iteration 3060/ 24424 | consumed samples: 1566720 | consumed tokens: 3208642560 | elapsed time per iteration (s): 2.23 | learning rate: 1.940E-04 | global batch size: 512 | lm loss: 2.434969E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.695 | TFLOPs: 23.65 | 63: iteration 3070/ 24424 | consumed samples: 1571840 | consumed tokens: 3219128320 | elapsed time per iteration (s): 2.23 | learning rate: 1.940E-04 | global batch size: 512 | lm loss: 2.451680E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.314 | TFLOPs: 23.61 | 63: iteration 3080/ 24424 | consumed samples: 1576960 | consumed tokens: 3229614080 | elapsed time per iteration (s): 2.27 | learning rate: 1.940E-04 | global batch size: 512 | lm loss: 2.440932E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.705 | TFLOPs: 23.24 | 63: iteration 3090/ 24424 | consumed samples: 1582080 | consumed tokens: 3240099840 | elapsed time per iteration (s): 2.24 | learning rate: 1.939E-04 | global batch size: 512 | lm loss: 2.439314E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.091 | TFLOPs: 23.48 | 63: iteration 3100/ 24424 | consumed samples: 1587200 | consumed tokens: 3250585600 | elapsed time per iteration (s): 2.26 | learning rate: 1.939E-04 | global batch size: 512 | lm loss: 2.458964E+00 | grad norm: 0.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.997 | TFLOPs: 23.37 | 63: iteration 3110/ 24424 | consumed samples: 1592320 | consumed tokens: 3261071360 | elapsed time per iteration (s): 2.36 | learning rate: 1.938E-04 | global batch size: 512 | lm loss: 2.472999E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 217.102 | TFLOPs: 22.35 | 63: iteration 3120/ 24424 | consumed samples: 1597440 | consumed tokens: 3271557120 | elapsed time per iteration (s): 2.34 | learning rate: 1.938E-04 | global batch size: 512 | lm loss: 2.475998E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 219.198 | TFLOPs: 22.57 | 63: iteration 3130/ 24424 | consumed samples: 1602560 | consumed tokens: 3282042880 | elapsed time per iteration (s): 2.25 | learning rate: 1.937E-04 | global batch size: 512 | lm loss: 2.452372E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.509 | TFLOPs: 23.42 | 63: iteration 3140/ 24424 | consumed samples: 1607680 | consumed tokens: 3292528640 | elapsed time per iteration (s): 2.25 | learning rate: 1.937E-04 | global batch size: 512 | lm loss: 2.438056E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.989 | TFLOPs: 23.47 | 63: iteration 3150/ 24424 | consumed samples: 1612800 | consumed tokens: 3303014400 | elapsed time per iteration (s): 2.26 | learning rate: 1.937E-04 | global batch size: 512 | lm loss: 2.433717E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.243 | TFLOPs: 23.29 | 63: iteration 3160/ 24424 | consumed samples: 1617920 | consumed tokens: 3313500160 | elapsed time per iteration (s): 2.23 | learning rate: 1.936E-04 | global batch size: 512 | lm loss: 2.435161E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.474 | TFLOPs: 23.62 | 63: iteration 3170/ 24424 | consumed samples: 1623040 | consumed tokens: 3323985920 | elapsed time per iteration (s): 2.26 | learning rate: 1.936E-04 | global batch size: 512 | lm loss: 2.438936E+00 | grad norm: 0.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.413 | TFLOPs: 23.31 | 63: iteration 3180/ 24424 | consumed samples: 1628160 | consumed tokens: 3334471680 | elapsed time per iteration (s): 2.24 | learning rate: 1.935E-04 | global batch size: 512 | lm loss: 2.440595E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.659 | TFLOPs: 23.54 | 63: iteration 3190/ 24424 | consumed samples: 1633280 | consumed tokens: 3344957440 | elapsed time per iteration (s): 2.26 | learning rate: 1.935E-04 | global batch size: 512 | lm loss: 2.439568E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.917 | TFLOPs: 23.36 | 63: iteration 3200/ 24424 | consumed samples: 1638400 | consumed tokens: 3355443200 | elapsed time per iteration (s): 2.23 | learning rate: 1.934E-04 | global batch size: 512 | lm loss: 2.434060E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.579 | TFLOPs: 23.63 | 63: iteration 3210/ 24424 | consumed samples: 1643520 | consumed tokens: 3365928960 | elapsed time per iteration (s): 2.25 | learning rate: 1.934E-04 | global batch size: 512 | lm loss: 2.434449E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.585 | TFLOPs: 23.43 | 63: iteration 3220/ 24424 | consumed samples: 1648640 | consumed tokens: 3376414720 | elapsed time per iteration (s): 2.24 | learning rate: 1.934E-04 | global batch size: 512 | lm loss: 2.435166E+00 | grad norm: 0.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.847 | TFLOPs: 23.56 | 63: iteration 3230/ 24424 | consumed samples: 1653760 | consumed tokens: 3386900480 | elapsed time per iteration (s): 2.23 | learning rate: 1.933E-04 | global batch size: 512 | lm loss: 2.414748E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.892 | TFLOPs: 23.67 | 63: iteration 3240/ 24424 | consumed samples: 1658880 | consumed tokens: 3397386240 | elapsed time per iteration (s): 2.23 | learning rate: 1.933E-04 | global batch size: 512 | lm loss: 2.427510E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.654 | TFLOPs: 23.64 | 63: iteration 3250/ 24424 | consumed samples: 1664000 | consumed tokens: 3407872000 | elapsed time per iteration (s): 2.24 | learning rate: 1.932E-04 | global batch size: 512 | lm loss: 2.411924E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.104 | TFLOPs: 23.48 | 63: iteration 3260/ 24424 | consumed samples: 1669120 | consumed tokens: 3418357760 | elapsed time per iteration (s): 2.24 | learning rate: 1.932E-04 | global batch size: 512 | lm loss: 2.438134E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.573 | TFLOPs: 23.53 | 63: iteration 3270/ 24424 | consumed samples: 1674240 | consumed tokens: 3428843520 | elapsed time per iteration (s): 2.23 | learning rate: 1.931E-04 | global batch size: 512 | lm loss: 2.412023E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.905 | TFLOPs: 23.67 | 63: iteration 3280/ 24424 | consumed samples: 1679360 | consumed tokens: 3439329280 | elapsed time per iteration (s): 2.25 | learning rate: 1.931E-04 | global batch size: 512 | lm loss: 2.437299E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.967 | TFLOPs: 23.47 | 63: iteration 3290/ 24424 | consumed samples: 1684480 | consumed tokens: 3449815040 | elapsed time per iteration (s): 2.23 | learning rate: 1.930E-04 | global batch size: 512 | lm loss: 2.432921E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.977 | TFLOPs: 23.67 | 63: iteration 3300/ 24424 | consumed samples: 1689600 | consumed tokens: 3460300800 | elapsed time per iteration (s): 2.31 | learning rate: 1.930E-04 | global batch size: 512 | lm loss: 2.424182E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.887 | TFLOPs: 22.84 | 63: iteration 3310/ 24424 | consumed samples: 1694720 | consumed tokens: 3470786560 | elapsed time per iteration (s): 2.23 | learning rate: 1.930E-04 | global batch size: 512 | lm loss: 2.417233E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.103 | TFLOPs: 23.59 | 63: iteration 3320/ 24424 | consumed samples: 1699840 | consumed tokens: 3481272320 | elapsed time per iteration (s): 2.27 | learning rate: 1.929E-04 | global batch size: 512 | lm loss: 2.420308E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.475 | TFLOPs: 23.21 | 63: iteration 3330/ 24424 | consumed samples: 1704960 | consumed tokens: 3491758080 | elapsed time per iteration (s): 2.25 | learning rate: 1.929E-04 | global batch size: 512 | lm loss: 2.417725E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.260 | TFLOPs: 23.40 | 63: iteration 3340/ 24424 | consumed samples: 1710080 | consumed tokens: 3502243840 | elapsed time per iteration (s): 2.24 | learning rate: 1.928E-04 | global batch size: 512 | lm loss: 2.405818E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.092 | TFLOPs: 23.48 | 63: iteration 3350/ 24424 | consumed samples: 1715200 | consumed tokens: 3512729600 | elapsed time per iteration (s): 2.26 | learning rate: 1.928E-04 | global batch size: 512 | lm loss: 2.424720E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.925 | TFLOPs: 23.36 | 63: iteration 3360/ 24424 | consumed samples: 1720320 | consumed tokens: 3523215360 | elapsed time per iteration (s): 2.24 | learning rate: 1.927E-04 | global batch size: 512 | lm loss: 2.418436E+00 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.111 | TFLOPs: 23.48 | 63: iteration 3370/ 24424 | consumed samples: 1725440 | consumed tokens: 3533701120 | elapsed time per iteration (s): 2.23 | learning rate: 1.927E-04 | global batch size: 512 | lm loss: 2.431087E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.743 | TFLOPs: 23.65 | 63: iteration 3380/ 24424 | consumed samples: 1730560 | consumed tokens: 3544186880 | elapsed time per iteration (s): 2.24 | learning rate: 1.926E-04 | global batch size: 512 | lm loss: 2.404524E+00 | grad norm: 0.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.160 | TFLOPs: 23.49 | 63: iteration 3390/ 24424 | consumed samples: 1735680 | consumed tokens: 3554672640 | elapsed time per iteration (s): 2.26 | learning rate: 1.926E-04 | global batch size: 512 | lm loss: 2.384048E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.040 | TFLOPs: 23.37 | 63: iteration 3400/ 24424 | consumed samples: 1740800 | consumed tokens: 3565158400 | elapsed time per iteration (s): 2.27 | learning rate: 1.925E-04 | global batch size: 512 | lm loss: 2.435802E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.179 | TFLOPs: 23.18 | 63: iteration 3410/ 24424 | consumed samples: 1745920 | consumed tokens: 3575644160 | elapsed time per iteration (s): 2.28 | learning rate: 1.925E-04 | global batch size: 512 | lm loss: 2.404581E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.681 | TFLOPs: 23.13 | 63: iteration 3420/ 24424 | consumed samples: 1751040 | consumed tokens: 3586129920 | elapsed time per iteration (s): 2.23 | learning rate: 1.924E-04 | global batch size: 512 | lm loss: 2.416844E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.243 | TFLOPs: 23.60 | 63: iteration 3430/ 24424 | consumed samples: 1756160 | consumed tokens: 3596615680 | elapsed time per iteration (s): 2.23 | learning rate: 1.924E-04 | global batch size: 512 | lm loss: 2.399505E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.749 | TFLOPs: 23.65 | 63: iteration 3440/ 24424 | consumed samples: 1761280 | consumed tokens: 3607101440 | elapsed time per iteration (s): 2.27 | learning rate: 1.924E-04 | global batch size: 512 | lm loss: 2.434568E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.223 | TFLOPs: 23.19 | 63: iteration 3450/ 24424 | consumed samples: 1766400 | consumed tokens: 3617587200 | elapsed time per iteration (s): 2.23 | learning rate: 1.923E-04 | global batch size: 512 | lm loss: 2.368677E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.799 | TFLOPs: 23.66 | 63: iteration 3460/ 24424 | consumed samples: 1771520 | consumed tokens: 3628072960 | elapsed time per iteration (s): 2.25 | learning rate: 1.923E-04 | global batch size: 512 | lm loss: 2.405255E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.304 | TFLOPs: 23.40 | 63: iteration 3470/ 24424 | consumed samples: 1776640 | consumed tokens: 3638558720 | elapsed time per iteration (s): 2.26 | learning rate: 1.922E-04 | global batch size: 512 | lm loss: 2.407463E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.529 | TFLOPs: 23.32 | 63: iteration 3480/ 24424 | consumed samples: 1781760 | consumed tokens: 3649044480 | elapsed time per iteration (s): 2.26 | learning rate: 1.922E-04 | global batch size: 512 | lm loss: 2.411659E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.505 | TFLOPs: 23.32 | 63: iteration 3490/ 24424 | consumed samples: 1786880 | consumed tokens: 3659530240 | elapsed time per iteration (s): 2.23 | learning rate: 1.921E-04 | global batch size: 512 | lm loss: 2.382508E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.247 | TFLOPs: 23.60 | 63: iteration 3500/ 24424 | consumed samples: 1792000 | consumed tokens: 3670016000 | elapsed time per iteration (s): 2.25 | learning rate: 1.921E-04 | global batch size: 512 | lm loss: 2.419353E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.083 | TFLOPs: 23.38 | 63: iteration 3510/ 24424 | consumed samples: 1797120 | consumed tokens: 3680501760 | elapsed time per iteration (s): 2.24 | learning rate: 1.920E-04 | global batch size: 512 | lm loss: 2.410523E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.122 | TFLOPs: 23.48 | 63: iteration 3520/ 24424 | consumed samples: 1802240 | consumed tokens: 3690987520 | elapsed time per iteration (s): 2.23 | learning rate: 1.920E-04 | global batch size: 512 | lm loss: 2.396022E+00 | grad norm: 0.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.881 | TFLOPs: 23.67 | 63: iteration 3530/ 24424 | consumed samples: 1807360 | consumed tokens: 3701473280 | elapsed time per iteration (s): 2.27 | learning rate: 1.919E-04 | global batch size: 512 | lm loss: 2.401189E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.932 | TFLOPs: 23.26 | 63: iteration 3540/ 24424 | consumed samples: 1812480 | consumed tokens: 3711959040 | elapsed time per iteration (s): 2.25 | learning rate: 1.919E-04 | global batch size: 512 | lm loss: 2.394395E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.862 | TFLOPs: 23.46 | 63: iteration 3550/ 24424 | consumed samples: 1817600 | consumed tokens: 3722444800 | elapsed time per iteration (s): 2.25 | learning rate: 1.918E-04 | global batch size: 512 | lm loss: 2.389701E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.337 | TFLOPs: 23.40 | 63: iteration 3560/ 24424 | consumed samples: 1822720 | consumed tokens: 3732930560 | elapsed time per iteration (s): 2.26 | learning rate: 1.918E-04 | global batch size: 512 | lm loss: 2.410385E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.993 | TFLOPs: 23.37 | 63: iteration 3570/ 24424 | consumed samples: 1827840 | consumed tokens: 3743416320 | elapsed time per iteration (s): 2.24 | learning rate: 1.917E-04 | global batch size: 512 | lm loss: 2.400915E+00 | grad norm: 0.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.462 | TFLOPs: 23.52 | 63: iteration 3580/ 24424 | consumed samples: 1832960 | consumed tokens: 3753902080 | elapsed time per iteration (s): 2.24 | learning rate: 1.917E-04 | global batch size: 512 | lm loss: 2.409758E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.590 | TFLOPs: 23.53 | 63: iteration 3590/ 24424 | consumed samples: 1838080 | consumed tokens: 3764387840 | elapsed time per iteration (s): 2.25 | learning rate: 1.916E-04 | global batch size: 512 | lm loss: 2.413377E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.727 | TFLOPs: 23.44 | 63: iteration 3600/ 24424 | consumed samples: 1843200 | consumed tokens: 3774873600 | elapsed time per iteration (s): 2.25 | learning rate: 1.916E-04 | global batch size: 512 | lm loss: 2.380466E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.154 | TFLOPs: 23.38 | 63: iteration 3610/ 24424 | consumed samples: 1848320 | consumed tokens: 3785359360 | elapsed time per iteration (s): 2.23 | learning rate: 1.915E-04 | global batch size: 512 | lm loss: 2.393562E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.526 | TFLOPs: 23.63 | 63: iteration 3620/ 24424 | consumed samples: 1853440 | consumed tokens: 3795845120 | elapsed time per iteration (s): 2.25 | learning rate: 1.915E-04 | global batch size: 512 | lm loss: 2.406739E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.339 | TFLOPs: 23.40 | 63: iteration 3630/ 24424 | consumed samples: 1858560 | consumed tokens: 3806330880 | elapsed time per iteration (s): 2.25 | learning rate: 1.914E-04 | global batch size: 512 | lm loss: 2.396870E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.739 | TFLOPs: 23.44 | 63: iteration 3640/ 24424 | consumed samples: 1863680 | consumed tokens: 3816816640 | elapsed time per iteration (s): 2.24 | learning rate: 1.914E-04 | global batch size: 512 | lm loss: 2.411814E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.151 | TFLOPs: 23.49 | 63: iteration 3650/ 24424 | consumed samples: 1868800 | consumed tokens: 3827302400 | elapsed time per iteration (s): 2.25 | learning rate: 1.913E-04 | global batch size: 512 | lm loss: 2.407576E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.539 | TFLOPs: 23.42 | 63: iteration 3660/ 24424 | consumed samples: 1873920 | consumed tokens: 3837788160 | elapsed time per iteration (s): 2.28 | learning rate: 1.913E-04 | global batch size: 512 | lm loss: 2.393446E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.260 | TFLOPs: 23.09 | 63: iteration 3670/ 24424 | consumed samples: 1879040 | consumed tokens: 3848273920 | elapsed time per iteration (s): 2.28 | learning rate: 1.912E-04 | global batch size: 512 | lm loss: 2.384761E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.903 | TFLOPs: 23.15 | 63: iteration 3680/ 24424 | consumed samples: 1884160 | consumed tokens: 3858759680 | elapsed time per iteration (s): 2.24 | learning rate: 1.912E-04 | global batch size: 512 | lm loss: 2.397313E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.275 | TFLOPs: 23.50 | 63: iteration 3690/ 24424 | consumed samples: 1889280 | consumed tokens: 3869245440 | elapsed time per iteration (s): 2.26 | learning rate: 1.911E-04 | global batch size: 512 | lm loss: 2.387650E+00 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.836 | TFLOPs: 23.35 | 63: iteration 3700/ 24424 | consumed samples: 1894400 | consumed tokens: 3879731200 | elapsed time per iteration (s): 2.25 | learning rate: 1.911E-04 | global batch size: 512 | lm loss: 2.378911E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.629 | TFLOPs: 23.43 | 63: iteration 3710/ 24424 | consumed samples: 1899520 | consumed tokens: 3890216960 | elapsed time per iteration (s): 2.23 | learning rate: 1.910E-04 | global batch size: 512 | lm loss: 2.382680E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.116 | TFLOPs: 23.59 | 63: iteration 3720/ 24424 | consumed samples: 1904640 | consumed tokens: 3900702720 | elapsed time per iteration (s): 2.23 | learning rate: 1.910E-04 | global batch size: 512 | lm loss: 2.367848E+00 | grad norm: 0.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.281 | TFLOPs: 23.60 | 63: iteration 3730/ 24424 | consumed samples: 1909760 | consumed tokens: 3911188480 | elapsed time per iteration (s): 2.24 | learning rate: 1.909E-04 | global batch size: 512 | lm loss: 2.390369E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.650 | TFLOPs: 23.54 | 63: iteration 3740/ 24424 | consumed samples: 1914880 | consumed tokens: 3921674240 | elapsed time per iteration (s): 2.39 | learning rate: 1.909E-04 | global batch size: 512 | lm loss: 2.361111E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 214.445 | TFLOPs: 22.08 | 63: iteration 3750/ 24424 | consumed samples: 1920000 | consumed tokens: 3932160000 | elapsed time per iteration (s): 2.28 | learning rate: 1.908E-04 | global batch size: 512 | lm loss: 2.371423E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.736 | TFLOPs: 23.14 | 63: iteration 3760/ 24424 | consumed samples: 1925120 | consumed tokens: 3942645760 | elapsed time per iteration (s): 2.25 | learning rate: 1.908E-04 | global batch size: 512 | lm loss: 2.383080E+00 | grad norm: 0.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.987 | TFLOPs: 23.47 | 63: iteration 3770/ 24424 | consumed samples: 1930240 | consumed tokens: 3953131520 | elapsed time per iteration (s): 2.24 | learning rate: 1.907E-04 | global batch size: 512 | lm loss: 2.396496E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.990 | TFLOPs: 23.57 | 63: iteration 3780/ 24424 | consumed samples: 1935360 | consumed tokens: 3963617280 | elapsed time per iteration (s): 2.24 | learning rate: 1.907E-04 | global batch size: 512 | lm loss: 2.364720E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.526 | TFLOPs: 23.53 | 63: iteration 3790/ 24424 | consumed samples: 1940480 | consumed tokens: 3974103040 | elapsed time per iteration (s): 2.25 | learning rate: 1.906E-04 | global batch size: 512 | lm loss: 2.387196E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.034 | TFLOPs: 23.47 | 63: iteration 3800/ 24424 | consumed samples: 1945600 | consumed tokens: 3984588800 | elapsed time per iteration (s): 2.24 | learning rate: 1.906E-04 | global batch size: 512 | lm loss: 2.389475E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.428 | TFLOPs: 23.52 | 63: iteration 3810/ 24424 | consumed samples: 1950720 | consumed tokens: 3995074560 | elapsed time per iteration (s): 2.24 | learning rate: 1.905E-04 | global batch size: 512 | lm loss: 2.379950E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.960 | TFLOPs: 23.57 | 63: iteration 3820/ 24424 | consumed samples: 1955840 | consumed tokens: 4005560320 | elapsed time per iteration (s): 2.26 | learning rate: 1.905E-04 | global batch size: 512 | lm loss: 2.377984E+00 | grad norm: 0.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.939 | TFLOPs: 23.36 | 63: iteration 3830/ 24424 | consumed samples: 1960960 | consumed tokens: 4016046080 | elapsed time per iteration (s): 2.44 | learning rate: 1.904E-04 | global batch size: 512 | lm loss: 2.364760E+00 | grad norm: 0.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 209.915 | TFLOPs: 21.61 | 63: iteration 3840/ 24424 | consumed samples: 1966080 | consumed tokens: 4026531840 | elapsed time per iteration (s): 3.04 | learning rate: 1.904E-04 | global batch size: 512 | lm loss: 2.368154E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.196 | TFLOPs: 17.31 | 63: iteration 3850/ 24424 | consumed samples: 1971200 | consumed tokens: 4037017600 | elapsed time per iteration (s): 3.04 | learning rate: 1.903E-04 | global batch size: 512 | lm loss: 2.379453E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.290 | TFLOPs: 17.32 | 63: iteration 3860/ 24424 | consumed samples: 1976320 | consumed tokens: 4047503360 | elapsed time per iteration (s): 2.25 | learning rate: 1.903E-04 | global batch size: 512 | lm loss: 2.472076E+00 | grad norm: 0.964 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.254 | TFLOPs: 23.39 | 63: iteration 3870/ 24424 | consumed samples: 1981440 | consumed tokens: 4057989120 | elapsed time per iteration (s): 2.27 | learning rate: 1.902E-04 | global batch size: 512 | lm loss: 3.176563E+00 | grad norm: 2.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.613 | TFLOPs: 23.23 | 63: iteration 3880/ 24424 | consumed samples: 1986560 | consumed tokens: 4068474880 | elapsed time per iteration (s): 2.25 | learning rate: 1.901E-04 | global batch size: 512 | lm loss: 3.225216E+00 | grad norm: 1.632 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.096 | TFLOPs: 23.38 | 63: iteration 3890/ 24424 | consumed samples: 1991680 | consumed tokens: 4078960640 | elapsed time per iteration (s): 2.26 | learning rate: 1.901E-04 | global batch size: 512 | lm loss: 3.157937E+00 | grad norm: 1.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.263 | TFLOPs: 23.29 | 63: iteration 3900/ 24424 | consumed samples: 1996800 | consumed tokens: 4089446400 | elapsed time per iteration (s): 2.27 | learning rate: 1.900E-04 | global batch size: 512 | lm loss: 3.099328E+00 | grad norm: 0.957 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.697 | TFLOPs: 23.23 | 63: iteration 3910/ 24424 | consumed samples: 2001920 | consumed tokens: 4099932160 | elapsed time per iteration (s): 2.24 | learning rate: 1.900E-04 | global batch size: 512 | lm loss: 2.966322E+00 | grad norm: 0.872 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.419 | TFLOPs: 23.51 | 63: iteration 3920/ 24424 | consumed samples: 2007040 | consumed tokens: 4110417920 | elapsed time per iteration (s): 2.25 | learning rate: 1.899E-04 | global batch size: 512 | lm loss: 2.766459E+00 | grad norm: 0.923 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.341 | TFLOPs: 23.40 | 63: iteration 3930/ 24424 | consumed samples: 2012160 | consumed tokens: 4120903680 | elapsed time per iteration (s): 2.24 | learning rate: 1.899E-04 | global batch size: 512 | lm loss: 2.670828E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.074 | TFLOPs: 23.48 | 63: iteration 3940/ 24424 | consumed samples: 2017280 | consumed tokens: 4131389440 | elapsed time per iteration (s): 2.23 | learning rate: 1.898E-04 | global batch size: 512 | lm loss: 2.585046E+00 | grad norm: 0.526 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.382 | TFLOPs: 23.61 | 63: iteration 3950/ 24424 | consumed samples: 2022400 | consumed tokens: 4141875200 | elapsed time per iteration (s): 2.23 | learning rate: 1.898E-04 | global batch size: 512 | lm loss: 2.528681E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.374 | TFLOPs: 23.61 | 63: iteration 3960/ 24424 | consumed samples: 2027520 | consumed tokens: 4152360960 | elapsed time per iteration (s): 2.27 | learning rate: 1.897E-04 | global batch size: 512 | lm loss: 2.499263E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.322 | TFLOPs: 23.20 | 63: iteration 3970/ 24424 | consumed samples: 2032640 | consumed tokens: 4162846720 | elapsed time per iteration (s): 2.23 | learning rate: 1.897E-04 | global batch size: 512 | lm loss: 2.470961E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.801 | TFLOPs: 23.66 | 63: iteration 3980/ 24424 | consumed samples: 2037760 | consumed tokens: 4173332480 | elapsed time per iteration (s): 2.24 | learning rate: 1.896E-04 | global batch size: 512 | lm loss: 2.461527E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.144 | TFLOPs: 23.49 | 63: iteration 3990/ 24424 | consumed samples: 2042880 | consumed tokens: 4183818240 | elapsed time per iteration (s): 2.24 | learning rate: 1.896E-04 | global batch size: 512 | lm loss: 2.443512E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.371 | TFLOPs: 23.51 | 0: [2022-11-25 19:33:10,904] [INFO] [logging.py:68:log_dist] [Rank 0] step=4000, skipped=0, lr=[0.00018949634970016766, 0.00018949634970016766, 0.00018949634970016766], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] 63: iteration 4000/ 24424 | consumed samples: 2048000 | consumed tokens: 4194304000 | elapsed time per iteration (s): 2.23 | learning rate: 1.895E-04 | global batch size: 512 | lm loss: 2.448409E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.769 | TFLOPs: 23.65 | 0: steps: 4000 loss: 2.4169 iter time (s): 2.259 samples/sec: 226.640 63: ------------------------------------------------------------------------------------------ 63: valid loss at iteration 4000 | lm loss value: 2.405044E+00 | lm loss PPL: 1.107892E+01 | 63: ------------------------------------------------------------------------------------------ 0: saving checkpoint at iteration 4000 to checkpoints_3b9 0: [2022-11-25 19:33:11,739] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step4000 is begin to save! 0: [2022-11-25 19:33:11,772] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_01-model_00-model_states.pt... 32: [2022-11-25 19:33:11,774] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_21-model_00-model_states.pt... 32: [2022-11-25 19:33:12,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_21-model_00-model_states.pt. 32: [2022-11-25 19:33:12,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_22-model_00-model_states.pt... 0: [2022-11-25 19:33:12,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_01-model_00-model_states.pt. 0: [2022-11-25 19:33:12,309] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_03-model_00-model_states.pt... 32: [2022-11-25 19:33:12,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_22-model_00-model_states.pt. 32: [2022-11-25 19:33:12,468] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_23-model_00-model_states.pt... 0: [2022-11-25 19:33:12,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_03-model_00-model_states.pt. 0: [2022-11-25 19:33:12,554] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_04-model_00-model_states.pt... 32: [2022-11-25 19:33:12,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_23-model_00-model_states.pt. 32: [2022-11-25 19:33:12,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_24-model_00-model_states.pt... 0: [2022-11-25 19:33:12,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_04-model_00-model_states.pt. 0: [2022-11-25 19:33:12,799] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_05-model_00-model_states.pt... 32: [2022-11-25 19:33:12,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_24-model_00-model_states.pt. 32: [2022-11-25 19:33:12,951] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_25-model_00-model_states.pt... 0: [2022-11-25 19:33:13,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_05-model_00-model_states.pt. 0: [2022-11-25 19:33:13,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_06-model_00-model_states.pt... 32: [2022-11-25 19:33:13,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_25-model_00-model_states.pt. 32: [2022-11-25 19:33:13,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_26-model_00-model_states.pt... 0: [2022-11-25 19:33:13,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_06-model_00-model_states.pt. 0: [2022-11-25 19:33:13,280] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_07-model_00-model_states.pt... 32: [2022-11-25 19:33:13,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_26-model_00-model_states.pt. 32: [2022-11-25 19:33:13,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_27-model_00-model_states.pt... 0: [2022-11-25 19:33:13,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_07-model_00-model_states.pt. 0: [2022-11-25 19:33:13,519] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_08-model_00-model_states.pt... 32: [2022-11-25 19:33:13,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_27-model_00-model_states.pt. 32: [2022-11-25 19:33:13,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_28-model_00-model_states.pt... 0: [2022-11-25 19:33:13,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_08-model_00-model_states.pt. 0: [2022-11-25 19:33:13,747] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_09-model_00-model_states.pt... 32: [2022-11-25 19:33:13,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_28-model_00-model_states.pt. 32: [2022-11-25 19:33:13,899] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_29-model_00-model_states.pt... 0: [2022-11-25 19:33:13,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_09-model_00-model_states.pt. 0: [2022-11-25 19:33:13,979] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_10-model_00-model_states.pt... 32: [2022-11-25 19:33:14,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_29-model_00-model_states.pt. 32: [2022-11-25 19:33:14,133] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_30-model_00-model_states.pt... 0: [2022-11-25 19:33:14,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_10-model_00-model_states.pt. 0: [2022-11-25 19:33:14,210] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_11-model_00-model_states.pt... 32: [2022-11-25 19:33:14,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_30-model_00-model_states.pt. 32: [2022-11-25 19:33:14,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_31-model_00-model_states.pt... 0: [2022-11-25 19:33:14,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_11-model_00-model_states.pt. 0: [2022-11-25 19:33:14,430] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_12-model_00-model_states.pt... 32: [2022-11-25 19:33:14,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_31-model_00-model_states.pt. 32: [2022-11-25 19:33:14,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_32-model_00-model_states.pt... 0: [2022-11-25 19:33:14,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_12-model_00-model_states.pt. 0: [2022-11-25 19:33:14,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_13-model_00-model_states.pt... 32: [2022-11-25 19:33:14,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_32-model_00-model_states.pt. 32: [2022-11-25 19:33:14,822] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_33-model_00-model_states.pt... 0: [2022-11-25 19:33:14,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_13-model_00-model_states.pt. 0: [2022-11-25 19:33:14,875] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_14-model_00-model_states.pt... 32: [2022-11-25 19:33:15,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_33-model_00-model_states.pt. 32: [2022-11-25 19:33:15,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_34-model_00-model_states.pt... 0: [2022-11-25 19:33:15,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_14-model_00-model_states.pt. 0: [2022-11-25 19:33:15,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_15-model_00-model_states.pt... 32: [2022-11-25 19:33:15,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_34-model_00-model_states.pt. 32: [2022-11-25 19:33:15,277] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_35-model_00-model_states.pt... 0: [2022-11-25 19:33:15,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_15-model_00-model_states.pt. 0: [2022-11-25 19:33:15,303] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_16-model_00-model_states.pt... 32: [2022-11-25 19:33:15,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_35-model_00-model_states.pt. 32: [2022-11-25 19:33:15,504] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_36-model_00-model_states.pt... 0: [2022-11-25 19:33:15,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_16-model_00-model_states.pt. 0: [2022-11-25 19:33:15,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_17-model_00-model_states.pt... 32: [2022-11-25 19:33:15,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_36-model_00-model_states.pt. 32: [2022-11-25 19:33:15,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_37-model_00-model_states.pt... 0: [2022-11-25 19:33:15,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_17-model_00-model_states.pt. 0: [2022-11-25 19:33:15,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_18-model_00-model_states.pt... 0: [2022-11-25 19:33:15,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_18-model_00-model_states.pt. 0: [2022-11-25 19:33:15,955] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_19-model_00-model_states.pt... 32: [2022-11-25 19:33:15,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_37-model_00-model_states.pt. 32: [2022-11-25 19:33:15,955] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_38-model_00-model_states.pt... 0: [2022-11-25 19:33:16,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_19-model_00-model_states.pt. 0: [2022-11-25 19:33:16,173] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_20-model_00-model_states.pt... 32: [2022-11-25 19:33:16,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_38-model_00-model_states.pt. 32: [2022-11-25 19:33:16,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/layer_40-model_00-model_states.pt... 32: [2022-11-25 19:33:16,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_40-model_00-model_states.pt. 32: [2022-11-25 19:33:16,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/mp_rank_01_model_states.pt... 32: [2022-11-25 19:33:16,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/mp_rank_01_model_states.pt. 0: [2022-11-25 19:33:16,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/layer_20-model_00-model_states.pt. 0: [2022-11-25 19:33:16,391] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step4000/mp_rank_00_model_states.pt 0: [2022-11-25 19:33:16,391] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/mp_rank_00_model_states.pt... 0: [2022-11-25 19:33:16,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/mp_rank_00_model_states.pt. 51: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 51: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 55: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 55: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 55: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 53: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 53: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 53: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 57: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 57: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 57: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 61: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 61: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 61: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 59: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 63: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 63: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 63: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 63: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 19: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 19: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 19: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 19: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 19: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 19: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 21: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 21: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 21: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 21: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 15: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 15: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 15: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 15: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 15: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 15: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 52: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 54: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 56: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 56: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 58: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 62: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 62: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 62: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 16: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 16: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 16: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 16: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 60: [2022-11-25 19:33:16,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 60: [2022-11-25 19:33:16,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 60: [2022-11-25 19:33:16,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-25 19:33:16,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-25 19:33:16,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-25 19:33:16,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 2: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 2: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 4: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 4: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 14: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 14: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 14: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 14: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 18: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 18: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 18: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 10: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 10: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 10: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 10: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 8: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 8: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 0: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 26: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 26: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 26: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 26: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 11: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 11: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 17: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 17: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 17: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 17: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 17: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 9: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 9: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 9: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 9: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 9: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 9: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 13: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 13: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 23: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 25: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 25: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 25: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 27: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 27: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 27: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 27: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 27: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 29: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 29: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 29: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 29: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 29: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 29: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 31: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 31: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 31: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 31: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 31: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 31: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 5: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 33: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 33: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 33: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 33: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 1: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 1: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 1: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 1: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 1: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 1: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 7: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 7: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 7: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 3: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 3: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 3: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 3: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 35: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 22: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 22: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 22: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 22: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 22: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 22: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 12: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 12: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 12: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 12: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 30: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 30: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 30: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 30: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 30: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 24: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 24: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 24: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 24: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 24: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 28: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 28: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 28: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 28: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 28: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 20: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 20: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 20: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 20: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 20: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 20: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 34: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 34: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 34: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 40: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 40: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 40: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 44: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 44: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 44: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 44: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 44: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 6: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 6: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 6: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 6: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 6: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 6: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 32: [2022-11-25 19:33:16,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 32: [2022-11-25 19:33:16,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 42: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 42: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 42: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 46: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 46: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 46: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 46: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 36: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 36: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 48: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 41: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 41: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 37: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 37: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 37: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 37: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 47: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 47: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 47: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 47: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 49: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 49: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 43: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 43: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 45: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 45: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 45: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 39: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 39: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 39: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 50: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 50: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 38: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 38: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 51: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 55: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 53: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 57: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 61: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 59: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 59: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 59: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 63: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 19: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 19: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 21: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 21: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 21: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 21: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 15: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 52: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 52: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 54: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 54: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 54: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 56: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 58: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 58: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 62: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 16: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 16: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 16: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 60: [2022-11-25 19:33:16,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 60: [2022-11-25 19:33:16,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 2: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 2: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 2: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 4: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 4: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 4: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 4: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 4: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 4: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 14: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 14: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 14: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 18: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 18: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 18: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 10: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 10: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 8: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 8: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 8: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 8: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 0: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 0: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 26: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 26: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 26: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 11: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 11: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 11: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 11: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 17: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 17: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 9: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 9: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 13: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 13: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 13: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 13: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 13: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 23: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 23: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 23: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 23: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 25: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 25: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 27: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 27: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 27: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 29: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 31: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 5: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 5: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 5: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 33: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 1: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 7: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 7: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 7: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 7: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 3: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 3: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 35: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 22: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 12: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 12: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 30: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 30: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 24: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 24: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 28: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 28: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 20: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 20: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 34: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 40: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 44: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 6: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 6: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 32: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 32: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 32: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 42: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 46: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 36: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 48: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 48: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 48: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 41: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 37: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 47: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 49: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 43: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 45: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 39: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 50: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 38: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 51: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 53: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 61: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 54: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 56: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 62: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 16: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 2: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 14: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 18: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 18: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 10: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 10: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 8: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 8: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 0: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 11: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 23: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 23: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 23: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 25: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 29: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 31: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 5: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 5: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 1: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 3: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 3: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 35: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 22: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 12: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 30: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 24: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 28: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 40: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 44: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 32: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 32: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 42: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 46: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 48: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 41: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 37: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 47: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 49: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 45: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 39: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 50: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 56: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 2: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 2: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 0: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 5: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 12: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 32: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 0: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 0: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 0: [2022-11-25 19:33:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 32: [2022-11-25 19:33:16,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-25 19:33:16,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-25 19:33:16,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: [2022-11-25 19:33:16,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-25 19:33:16,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 0: [2022-11-25 19:33:16,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: [2022-11-25 19:33:16,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 0: [2022-11-25 19:33:16,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 0: [2022-11-25 19:33:16,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 0: [2022-11-25 19:33:16,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 32: [2022-11-25 19:33:16,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-25 19:33:16,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-25 19:33:16,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 32: [2022-11-25 19:33:16,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-25 19:33:16,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-25 19:33:16,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 32: [2022-11-25 19:33:16,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 32: [2022-11-25 19:33:16,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-25 19:33:16,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 13: [2022-11-25 19:33:16,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 13: [2022-11-25 19:33:16,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-25 19:33:16,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: [2022-11-25 19:33:16,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-25 19:33:16,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-25 19:33:16,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 57: [2022-11-25 19:33:16,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 57: [2022-11-25 19:33:16,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-25 19:33:16,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 57: [2022-11-25 19:33:16,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-25 19:33:16,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 57: [2022-11-25 19:33:16,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 26: [2022-11-25 19:33:16,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-25 19:33:16,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-25 19:33:16,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 39: [2022-11-25 19:33:16,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 26: [2022-11-25 19:33:16,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 26: [2022-11-25 19:33:16,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 39: [2022-11-25 19:33:16,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 26: [2022-11-25 19:33:16,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 26: [2022-11-25 19:33:16,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 39: [2022-11-25 19:33:16,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 26: [2022-11-25 19:33:16,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-25 19:33:16,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 36: [2022-11-25 19:33:16,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 36: [2022-11-25 19:33:16,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-25 19:33:16,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-25 19:33:16,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 36: [2022-11-25 19:33:16,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 36: [2022-11-25 19:33:16,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 16: [2022-11-25 19:33:16,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-25 19:33:16,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-25 19:33:16,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-25 19:33:16,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-25 19:33:16,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-25 19:33:16,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-25 19:33:16,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 16: [2022-11-25 19:33:16,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 16: [2022-11-25 19:33:16,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 11: [2022-11-25 19:33:16,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-25 19:33:16,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-25 19:33:16,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 12: [2022-11-25 19:33:16,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 46: [2022-11-25 19:33:16,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-25 19:33:16,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 12: [2022-11-25 19:33:16,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 46: [2022-11-25 19:33:16,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 12: [2022-11-25 19:33:16,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-25 19:33:16,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 12: [2022-11-25 19:33:16,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-25 19:33:16,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 57: [2022-11-25 19:33:16,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 28: [2022-11-25 19:33:16,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 57: [2022-11-25 19:33:16,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 57: [2022-11-25 19:33:16,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 28: [2022-11-25 19:33:16,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-25 19:33:16,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 19:33:16,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 41: [2022-11-25 19:33:16,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 41: [2022-11-25 19:33:16,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: [2022-11-25 19:33:16,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-25 19:33:16,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-25 19:33:16,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 45: [2022-11-25 19:33:16,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 45: [2022-11-25 19:33:16,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-25 19:33:16,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 45: [2022-11-25 19:33:16,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-25 19:33:16,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 45: [2022-11-25 19:33:16,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 10: [2022-11-25 19:33:16,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-25 19:33:16,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 10: [2022-11-25 19:33:16,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-25 19:33:16,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 57: [2022-11-25 19:33:16,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 10: [2022-11-25 19:33:16,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 10: [2022-11-25 19:33:16,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 57: [2022-11-25 19:33:16,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-25 19:33:16,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 19:33:16,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-25 19:33:16,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-25 19:33:16,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 53: [2022-11-25 19:33:16,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-25 19:33:16,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-25 19:33:16,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-25 19:33:16,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-25 19:33:16,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-25 19:33:16,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-25 19:33:16,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 53: [2022-11-25 19:33:16,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 53: [2022-11-25 19:33:16,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 25: [2022-11-25 19:33:16,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-25 19:33:16,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 25: [2022-11-25 19:33:16,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 25: [2022-11-25 19:33:16,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-25 19:33:16,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-25 19:33:16,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-25 19:33:16,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 25: [2022-11-25 19:33:16,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 25: [2022-11-25 19:33:16,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 58: [2022-11-25 19:33:16,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 58: [2022-11-25 19:33:16,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-25 19:33:16,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-25 19:33:16,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-25 19:33:16,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-25 19:33:16,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-25 19:33:16,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 58: [2022-11-25 19:33:16,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 58: [2022-11-25 19:33:16,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 19:33:16,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-25 19:33:16,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 7: [2022-11-25 19:33:16,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 39: [2022-11-25 19:33:16,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 39: [2022-11-25 19:33:16,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-25 19:33:16,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 19:33:16,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-25 19:33:16,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 7: [2022-11-25 19:33:16,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-25 19:33:16,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-25 19:33:16,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 19:33:16,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 19:33:16,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-25 19:33:16,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 23: [2022-11-25 19:33:16,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 63: [2022-11-25 19:33:16,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 51: [2022-11-25 19:33:16,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 63: [2022-11-25 19:33:16,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 23: [2022-11-25 19:33:16,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 51: [2022-11-25 19:33:16,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 63: [2022-11-25 19:33:16,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 23: [2022-11-25 19:33:16,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 42: [2022-11-25 19:33:16,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 42: [2022-11-25 19:33:16,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 42: [2022-11-25 19:33:16,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 51: [2022-11-25 19:33:16,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 19:33:16,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 19:33:16,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 19:33:16,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 42: [2022-11-25 19:33:16,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-25 19:33:16,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-25 19:33:16,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 63: [2022-11-25 19:33:16,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 42: [2022-11-25 19:33:16,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 42: [2022-11-25 19:33:16,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 42: [2022-11-25 19:33:16,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 57: [2022-11-25 19:33:16,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 57: [2022-11-25 19:33:16,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-25 19:33:16,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 55: [2022-11-25 19:33:16,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-25 19:33:16,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-25 19:33:16,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 62: [2022-11-25 19:33:16,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-25 19:33:16,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 51: [2022-11-25 19:33:16,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 62: [2022-11-25 19:33:16,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 51: [2022-11-25 19:33:16,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 62: [2022-11-25 19:33:16,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 51: [2022-11-25 19:33:16,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 62: [2022-11-25 19:33:16,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 51: [2022-11-25 19:33:16,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 62: [2022-11-25 19:33:16,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 51: [2022-11-25 19:33:16,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-25 19:33:16,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 19:33:16,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-25 19:33:16,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 5: [2022-11-25 19:33:16,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 19:33:16,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-25 19:33:16,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-25 19:33:16,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 40: [2022-11-25 19:33:16,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-25 19:33:16,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 24: [2022-11-25 19:33:16,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 40: [2022-11-25 19:33:16,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 19:33:16,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-25 19:33:16,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 13: [2022-11-25 19:33:16,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-25 19:33:16,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-25 19:33:16,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 34: [2022-11-25 19:33:16,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-25 19:33:16,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-25 19:33:16,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 46: [2022-11-25 19:33:16,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-25 19:33:16,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 34: [2022-11-25 19:33:16,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-25 19:33:16,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 34: [2022-11-25 19:33:16,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-25 19:33:16,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-25 19:33:16,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 34: [2022-11-25 19:33:16,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 46: [2022-11-25 19:33:16,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 12: [2022-11-25 19:33:16,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-25 19:33:16,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-25 19:33:16,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 23: [2022-11-25 19:33:16,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-25 19:33:16,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-25 19:33:16,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 19:33:16,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-25 19:33:16,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 4: [2022-11-25 19:33:16,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 4: [2022-11-25 19:33:16,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-25 19:33:16,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-25 19:33:16,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-25 19:33:16,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 19:33:16,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 19:33:16,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 19: [2022-11-25 19:33:16,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-25 19:33:16,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-25 19:33:16,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 19: [2022-11-25 19:33:16,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-25 19:33:16,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-25 19:33:16,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-25 19:33:16,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 19: [2022-11-25 19:33:16,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 19: [2022-11-25 19:33:16,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 50: [2022-11-25 19:33:16,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 50: [2022-11-25 19:33:16,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-25 19:33:16,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 62: [2022-11-25 19:33:16,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-25 19:33:16,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-25 19:33:16,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 19:33:16,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-25 19:33:16,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-25 19:33:16,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 48: [2022-11-25 19:33:16,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-25 19:33:16,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 48: [2022-11-25 19:33:16,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-25 19:33:16,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-25 19:33:16,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-25 19:33:16,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-25 19:33:16,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-25 19:33:16,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-25 19:33:16,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 48: [2022-11-25 19:33:16,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 48: [2022-11-25 19:33:16,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 48: [2022-11-25 19:33:16,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 50: [2022-11-25 19:33:16,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 50: [2022-11-25 19:33:16,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-25 19:33:16,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 30: [2022-11-25 19:33:16,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 30: [2022-11-25 19:33:16,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-25 19:33:16,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 30: [2022-11-25 19:33:16,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 30: [2022-11-25 19:33:16,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 30: [2022-11-25 19:33:16,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 13: [2022-11-25 19:33:16,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-25 19:33:16,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-25 19:33:16,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 19:33:16,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 24: [2022-11-25 19:33:16,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-25 19:33:16,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 59: [2022-11-25 19:33:16,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 12: [2022-11-25 19:33:16,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 59: [2022-11-25 19:33:16,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-25 19:33:16,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 12: [2022-11-25 19:33:16,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-25 19:33:16,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 36: [2022-11-25 19:33:16,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 36: [2022-11-25 19:33:16,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-25 19:33:16,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 56: [2022-11-25 19:33:16,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-25 19:33:16,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-25 19:33:16,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 31: [2022-11-25 19:33:16,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-25 19:33:16,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-25 19:33:16,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 19:33:16,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-25 19:33:16,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-25 19:33:16,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 44: [2022-11-25 19:33:16,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-25 19:33:16,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-25 19:33:16,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 19:33:16,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 30: [2022-11-25 19:33:16,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 24: [2022-11-25 19:33:16,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 30: [2022-11-25 19:33:16,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 24: [2022-11-25 19:33:16,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 30: [2022-11-25 19:33:16,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 11: [2022-11-25 19:33:16,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-25 19:33:16,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-25 19:33:16,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 19:33:16,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-25 19:33:16,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-25 19:33:16,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 6: [2022-11-25 19:33:16,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-25 19:33:16,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-25 19:33:16,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 19:33:16,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-25 19:33:16,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 19:33:16,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 19:33:16,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-25 19:33:16,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 44: [2022-11-25 19:33:16,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 18: [2022-11-25 19:33:16,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 44: [2022-11-25 19:33:16,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 44: [2022-11-25 19:33:16,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 19:33:16,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-25 19:33:16,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 47: [2022-11-25 19:33:16,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 44: [2022-11-25 19:33:16,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 47: [2022-11-25 19:33:16,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 19:33:16,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 44: [2022-11-25 19:33:16,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 47: [2022-11-25 19:33:16,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 52: [2022-11-25 19:33:16,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-25 19:33:16,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-25 19:33:16,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-25 19:33:16,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 44: [2022-11-25 19:33:16,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 52: [2022-11-25 19:33:16,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-25 19:33:16,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-25 19:33:16,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-25 19:33:16,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-25 19:33:16,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 52: [2022-11-25 19:33:16,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 52: [2022-11-25 19:33:16,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 52: [2022-11-25 19:33:16,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 19:33:16,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-25 19:33:16,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-25 19:33:16,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 59: [2022-11-25 19:33:16,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-25 19:33:16,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-25 19:33:16,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 19:33:16,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-25 19:33:16,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 2: [2022-11-25 19:33:16,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-25 19:33:16,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 11: [2022-11-25 19:33:16,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 2: [2022-11-25 19:33:16,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-25 19:33:16,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 11: [2022-11-25 19:33:16,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 2: [2022-11-25 19:33:16,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 11: [2022-11-25 19:33:16,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 19:33:16,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 19:33:16,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 23: [2022-11-25 19:33:16,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-25 19:33:16,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-25 19:33:16,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 23: [2022-11-25 19:33:16,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 23: [2022-11-25 19:33:16,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-25 19:33:16,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 36: [2022-11-25 19:33:16,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 20: [2022-11-25 19:33:16,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-25 19:33:16,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-25 19:33:16,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-25 19:33:16,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-25 19:33:16,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 19:33:16,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-25 19:33:16,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-25 19:33:16,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 19:33:16,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 19:33:16,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-25 19:33:16,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-25 19:33:16,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 36: [2022-11-25 19:33:16,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 37: [2022-11-25 19:33:16,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-25 19:33:16,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 36: [2022-11-25 19:33:16,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 37: [2022-11-25 19:33:16,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-25 19:33:16,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-25 19:33:16,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 37: [2022-11-25 19:33:16,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 40: [2022-11-25 19:33:16,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-25 19:33:16,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-25 19:33:16,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 35: [2022-11-25 19:33:16,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 35: [2022-11-25 19:33:16,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-25 19:33:16,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 35: [2022-11-25 19:33:16,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-25 19:33:16,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-25 19:33:16,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 35: [2022-11-25 19:33:16,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-25 19:33:16,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 35: [2022-11-25 19:33:16,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 12: [2022-11-25 19:33:16,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-25 19:33:16,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-25 19:33:16,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 10: [2022-11-25 19:33:16,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-25 19:33:16,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-25 19:33:16,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 8: [2022-11-25 19:33:16,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-25 19:33:16,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-25 19:33:16,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 8: [2022-11-25 19:33:16,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 8: [2022-11-25 19:33:16,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-25 19:33:16,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-25 19:33:16,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 8: [2022-11-25 19:33:16,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 8: [2022-11-25 19:33:16,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 14: [2022-11-25 19:33:16,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-25 19:33:16,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-25 19:33:16,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-25 19:33:16,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-25 19:33:16,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-25 19:33:16,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-25 19:33:16,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 14: [2022-11-25 19:33:16,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 14: [2022-11-25 19:33:16,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 60: [2022-11-25 19:33:16,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-25 19:33:16,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-25 19:33:16,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 19:33:16,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-25 19:33:16,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 1: [2022-11-25 19:33:16,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-25 19:33:16,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-25 19:33:16,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 1: [2022-11-25 19:33:16,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 1: [2022-11-25 19:33:16,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 19:33:16,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 19:33:16,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 55: [2022-11-25 19:33:16,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-25 19:33:16,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-25 19:33:16,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 40: [2022-11-25 19:33:16,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-25 19:33:16,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-25 19:33:16,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 11: [2022-11-25 19:33:16,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 50: [2022-11-25 19:33:16,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-25 19:33:16,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 11: [2022-11-25 19:33:16,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 50: [2022-11-25 19:33:16,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 11: [2022-11-25 19:33:16,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 55: [2022-11-25 19:33:16,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 55: [2022-11-25 19:33:16,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-25 19:33:16,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 49: [2022-11-25 19:33:16,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-25 19:33:16,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-25 19:33:16,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-25 19:33:16,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-25 19:33:16,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 49: [2022-11-25 19:33:16,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-25 19:33:16,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-25 19:33:16,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 49: [2022-11-25 19:33:16,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 3: [2022-11-25 19:33:16,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 3: [2022-11-25 19:33:16,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 3: [2022-11-25 19:33:16,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-25 19:33:16,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 3: [2022-11-25 19:33:16,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-25 19:33:16,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 3: [2022-11-25 19:33:16,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 3: [2022-11-25 19:33:16,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 3: [2022-11-25 19:33:16,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 43: [2022-11-25 19:33:16,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-25 19:33:16,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-25 19:33:16,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-25 19:33:16,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 43: [2022-11-25 19:33:16,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 43: [2022-11-25 19:33:16,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 17: [2022-11-25 19:33:16,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-25 19:33:16,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 17: [2022-11-25 19:33:16,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-25 19:33:16,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 17: [2022-11-25 19:33:16,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 17: [2022-11-25 19:33:16,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 19:33:16,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 54: [2022-11-25 19:33:16,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 54: [2022-11-25 19:33:16,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-25 19:33:16,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-25 19:33:16,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 19:33:16,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 52: [2022-11-25 19:33:16,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-25 19:33:16,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-25 19:33:16,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 19:33:16,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-25 19:33:16,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-25 19:33:16,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 36: [2022-11-25 19:33:16,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-25 19:33:16,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-25 19:33:16,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 19:33:16,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-25 19:33:16,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-25 19:33:16,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 13: [2022-11-25 19:33:16,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 18: [2022-11-25 19:33:16,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-25 19:33:16,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 13: [2022-11-25 19:33:16,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-25 19:33:16,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 19:33:16,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 11: [2022-11-25 19:33:16,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 11: [2022-11-25 19:33:16,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 11: [2022-11-25 19:33:16,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 19:33:16,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 61: [2022-11-25 19:33:16,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 33: [2022-11-25 19:33:16,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 61: [2022-11-25 19:33:16,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 33: [2022-11-25 19:33:16,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-25 19:33:16,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 11: [2022-11-25 19:33:16,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 11: [2022-11-25 19:33:16,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 11: [2022-11-25 19:33:16,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 43: [2022-11-25 19:33:16,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 43: [2022-11-25 19:33:16,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 43: [2022-11-25 19:33:16,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 17: [2022-11-25 19:33:16,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-25 19:33:16,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-25 19:33:16,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 49: [2022-11-25 19:33:16,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-25 19:33:16,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 49: [2022-11-25 19:33:16,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 16: [2022-11-25 19:33:16,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 16: [2022-11-25 19:33:16,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-25 19:33:16,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 19:33:16,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-25 19:33:16,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-25 19:33:16,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 22: [2022-11-25 19:33:16,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-25 19:33:16,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-25 19:33:16,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-25 19:33:16,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-25 19:33:16,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 22: [2022-11-25 19:33:16,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 19:33:16,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-25 19:33:16,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-25 19:33:16,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-25 19:33:16,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-25 19:33:16,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 19:33:16,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 22: [2022-11-25 19:33:16,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-25 19:33:16,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-25 19:33:16,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 19:33:16,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-25 19:33:16,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-25 19:33:16,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 19:33:16,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-25 19:33:16,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-25 19:33:16,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 45: [2022-11-25 19:33:16,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 45: [2022-11-25 19:33:16,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-25 19:33:16,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 45: [2022-11-25 19:33:16,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 45: [2022-11-25 19:33:16,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-25 19:33:16,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 46: [2022-11-25 19:33:16,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-25 19:33:16,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-25 19:33:16,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 27: [2022-11-25 19:33:16,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 27: [2022-11-25 19:33:16,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-25 19:33:16,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 52: [2022-11-25 19:33:16,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-25 19:33:16,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-25 19:33:16,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 59: [2022-11-25 19:33:16,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-25 19:33:16,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-25 19:33:16,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: [2022-11-25 19:33:16,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 0: [2022-11-25 19:33:16,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-25 19:33:16,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 33: [2022-11-25 19:33:16,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-25 19:33:16,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-25 19:33:16,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: [2022-11-25 19:33:16,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-25 19:33:16,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 28: [2022-11-25 19:33:16,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 28: [2022-11-25 19:33:16,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-25 19:33:16,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 46: [2022-11-25 19:33:16,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-25 19:33:16,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-25 19:33:16,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 37: [2022-11-25 19:33:16,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 37: [2022-11-25 19:33:16,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 33: [2022-11-25 19:33:16,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 37: [2022-11-25 19:33:16,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 33: [2022-11-25 19:33:16,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 33: [2022-11-25 19:33:16,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 22: [2022-11-25 19:33:16,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-25 19:33:16,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-25 19:33:16,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 37: [2022-11-25 19:33:16,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-25 19:33:16,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-25 19:33:16,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 38: [2022-11-25 19:33:16,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-25 19:33:16,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-25 19:33:16,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 38: [2022-11-25 19:33:16,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-25 19:33:16,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-25 19:33:16,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-25 19:33:16,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 38: [2022-11-25 19:33:16,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 38: [2022-11-25 19:33:16,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 39: [2022-11-25 19:33:16,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-25 19:33:16,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-25 19:33:16,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 39: [2022-11-25 19:33:16,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-25 19:33:16,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-25 19:33:16,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 12: [2022-11-25 19:33:16,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-25 19:33:16,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-25 19:33:16,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 60: [2022-11-25 19:33:16,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-25 19:33:16,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-25 19:33:16,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 60: [2022-11-25 19:33:16,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-25 19:33:16,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 28: [2022-11-25 19:33:16,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 60: [2022-11-25 19:33:16,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 28: [2022-11-25 19:33:16,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-25 19:33:16,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 19:33:16,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 9: [2022-11-25 19:33:16,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-25 19:33:16,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 9: [2022-11-25 19:33:16,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-25 19:33:16,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-25 19:33:16,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-25 19:33:16,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 19:33:16,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 19:33:16,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 32: [2022-11-25 19:33:16,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-25 19:33:16,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-25 19:33:16,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 57: [2022-11-25 19:33:16,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 57: [2022-11-25 19:33:16,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 57: [2022-11-25 19:33:16,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 16: [2022-11-25 19:33:16,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-25 19:33:16,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-25 19:33:16,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 36: [2022-11-25 19:33:16,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 36: [2022-11-25 19:33:16,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-25 19:33:16,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 16: [2022-11-25 19:33:16,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-25 19:33:16,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-25 19:33:16,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 56: [2022-11-25 19:33:16,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-25 19:33:16,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-25 19:33:16,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 26: [2022-11-25 19:33:16,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-25 19:33:16,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-25 19:33:16,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 39: [2022-11-25 19:33:16,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-25 19:33:16,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-25 19:33:16,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 45: [2022-11-25 19:33:16,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-25 19:33:16,793] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-25 19:33:16,793] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 26: [2022-11-25 19:33:16,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 26: [2022-11-25 19:33:16,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 26: [2022-11-25 19:33:16,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 42: [2022-11-25 19:33:16,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 42: [2022-11-25 19:33:16,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 42: [2022-11-25 19:33:16,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 27: [2022-11-25 19:33:16,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 27: [2022-11-25 19:33:16,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-25 19:33:16,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 27: [2022-11-25 19:33:16,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-25 19:33:16,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-25 19:33:16,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 19:33:16,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 54: [2022-11-25 19:33:16,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-25 19:33:16,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 51: [2022-11-25 19:33:16,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-25 19:33:16,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-25 19:33:16,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 56: [2022-11-25 19:33:16,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-25 19:33:16,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-25 19:33:16,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 56: [2022-11-25 19:33:16,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-25 19:33:16,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-25 19:33:16,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 45: [2022-11-25 19:33:16,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-25 19:33:16,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-25 19:33:16,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 52: [2022-11-25 19:33:16,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-25 19:33:16,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-25 19:33:16,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 19:33:16,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 15: [2022-11-25 19:33:16,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-25 19:33:16,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 29: [2022-11-25 19:33:16,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-25 19:33:16,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-25 19:33:16,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-25 19:33:16,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-25 19:33:16,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-25 19:33:16,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-25 19:33:16,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-25 19:33:16,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-25 19:33:16,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 29: [2022-11-25 19:33:16,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-25 19:33:16,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 29: [2022-11-25 19:33:16,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 29: [2022-11-25 19:33:16,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 29: [2022-11-25 19:33:16,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 29: [2022-11-25 19:33:16,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 44: [2022-11-25 19:33:16,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-25 19:33:16,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-25 19:33:16,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 50: [2022-11-25 19:33:16,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-25 19:33:16,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-25 19:33:16,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 19:33:16,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-25 19:33:16,813] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 15: [2022-11-25 19:33:16,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 19:33:16,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-25 19:33:16,813] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 49: [2022-11-25 19:33:16,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 15: [2022-11-25 19:33:16,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 49: [2022-11-25 19:33:16,813] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-25 19:33:16,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 53: [2022-11-25 19:33:16,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-25 19:33:16,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-25 19:33:16,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 3: [2022-11-25 19:33:16,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-25 19:33:16,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-25 19:33:16,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 19:33:16,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-25 19:33:16,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 34: [2022-11-25 19:33:16,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 15: [2022-11-25 19:33:16,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 34: [2022-11-25 19:33:16,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-25 19:33:16,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 58: [2022-11-25 19:33:16,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-25 19:33:16,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-25 19:33:16,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 19:33:16,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-25 19:33:16,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-25 19:33:16,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 32: [2022-11-25 19:33:16,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 32: [2022-11-25 19:33:16,817] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-25 19:33:16,817] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 19:33:16,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 13: [2022-11-25 19:33:16,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 13: [2022-11-25 19:33:16,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-25 19:33:16,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 19:33:16,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-25 19:33:16,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-25 19:33:16,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 19:33:16,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 63: [2022-11-25 19:33:16,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 54: [2022-11-25 19:33:16,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 63: [2022-11-25 19:33:16,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 19:33:16,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 62: [2022-11-25 19:33:16,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-25 19:33:16,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-25 19:33:16,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 26: [2022-11-25 19:33:16,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 26: [2022-11-25 19:33:16,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-25 19:33:16,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 19:33:16,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 8: [2022-11-25 19:33:16,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 54: [2022-11-25 19:33:16,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 8: [2022-11-25 19:33:16,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-25 19:33:16,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 19:33:16,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 54: [2022-11-25 19:33:16,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 19:33:16,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-25 19:33:16,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 49: [2022-11-25 19:33:16,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-25 19:33:16,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-25 19:33:16,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 19:33:16,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-25 19:33:16,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-25 19:33:16,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 19:33:16,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 29: [2022-11-25 19:33:16,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-25 19:33:16,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-25 19:33:16,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 43: [2022-11-25 19:33:16,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-25 19:33:16,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-25 19:33:16,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 35: [2022-11-25 19:33:16,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-25 19:33:16,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-25 19:33:16,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 45: [2022-11-25 19:33:16,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-25 19:33:16,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-25 19:33:16,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 19:33:16,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-25 19:33:16,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-25 19:33:16,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 59: [2022-11-25 19:33:16,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 59: [2022-11-25 19:33:16,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-25 19:33:16,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 17: [2022-11-25 19:33:16,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-25 19:33:16,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-25 19:33:16,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 10: [2022-11-25 19:33:16,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-25 19:33:16,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-25 19:33:16,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 30: [2022-11-25 19:33:16,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-25 19:33:16,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-25 19:33:16,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 38: [2022-11-25 19:33:16,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-25 19:33:16,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-25 19:33:16,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 60: [2022-11-25 19:33:16,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-25 19:33:16,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-25 19:33:16,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 33: [2022-11-25 19:33:16,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-25 19:33:16,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-25 19:33:16,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 19:33:16,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-25 19:33:16,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-25 19:33:16,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 19:33:16,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-25 19:33:16,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 19:33:16,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-25 19:33:16,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-25 19:33:16,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 39: [2022-11-25 19:33:16,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-25 19:33:16,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-25 19:33:16,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 19:33:16,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 9: [2022-11-25 19:33:16,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-25 19:33:16,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 19:33:16,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-25 19:33:16,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-25 19:33:16,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 14: [2022-11-25 19:33:16,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 14: [2022-11-25 19:33:16,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-25 19:33:16,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 19:33:16,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-25 19:33:16,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-25 19:33:16,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 19:33:16,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 41: [2022-11-25 19:33:16,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 5: [2022-11-25 19:33:16,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 41: [2022-11-25 19:33:16,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 5: [2022-11-25 19:33:16,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 19:33:16,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 27: [2022-11-25 19:33:16,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-25 19:33:16,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-25 19:33:16,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 19:33:16,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-25 19:33:16,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-25 19:33:16,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 19:33:16,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-25 19:33:16,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 21: [2022-11-25 19:33:16,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 16: [2022-11-25 19:33:16,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-25 19:33:16,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-25 19:33:16,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 31: [2022-11-25 19:33:16,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-25 19:33:16,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-25 19:33:16,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 28: [2022-11-25 19:33:16,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 28: [2022-11-25 19:33:16,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-25 19:33:16,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 55: [2022-11-25 19:33:16,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-25 19:33:16,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-25 19:33:16,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 48: [2022-11-25 19:33:16,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-25 19:33:16,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-25 19:33:16,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 19:33:16,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 6: [2022-11-25 19:33:16,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-25 19:33:16,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 40: [2022-11-25 19:33:16,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-25 19:33:16,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-25 19:33:16,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 23: [2022-11-25 19:33:16,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-25 19:33:16,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-25 19:33:16,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 22: [2022-11-25 19:33:16,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-25 19:33:16,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-25 19:33:16,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 19:33:16,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-25 19:33:16,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-25 19:33:16,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 22: [2022-11-25 19:33:16,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-25 19:33:16,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-25 19:33:16,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 46: [2022-11-25 19:33:16,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-25 19:33:16,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-25 19:33:16,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 31: [2022-11-25 19:33:16,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 31: [2022-11-25 19:33:16,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-25 19:33:16,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 11: [2022-11-25 19:33:16,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-25 19:33:16,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-25 19:33:16,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 22: [2022-11-25 19:33:16,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 22: [2022-11-25 19:33:16,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 22: [2022-11-25 19:33:16,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 32: [2022-11-25 19:33:16,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-25 19:33:16,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-25 19:33:16,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 37: [2022-11-25 19:33:16,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-25 19:33:16,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-25 19:33:16,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 31: [2022-11-25 19:33:16,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 31: [2022-11-25 19:33:16,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-25 19:33:16,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 49: [2022-11-25 19:33:16,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 49: [2022-11-25 19:33:16,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-25 19:33:16,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 19:33:16,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 15: [2022-11-25 19:33:16,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-25 19:33:16,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 56: [2022-11-25 19:33:16,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-25 19:33:16,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 56: [2022-11-25 19:33:16,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 13: [2022-11-25 19:33:16,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-25 19:33:16,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-25 19:33:16,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: [2022-11-25 19:33:16,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-25 19:33:16,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-25 19:33:16,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 19:33:16,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 54: [2022-11-25 19:33:16,884] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-25 19:33:16,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 19:33:16,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 9: [2022-11-25 19:33:16,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-25 19:33:16,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 19:33:16,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-25 19:33:16,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-25 19:33:16,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 29: [2022-11-25 19:33:16,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-25 19:33:16,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-25 19:33:16,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 25: [2022-11-25 19:33:16,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 25: [2022-11-25 19:33:16,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-25 19:33:16,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 57: [2022-11-25 19:33:16,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 57: [2022-11-25 19:33:16,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-25 19:33:16,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 42: [2022-11-25 19:33:16,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 42: [2022-11-25 19:33:16,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-25 19:33:16,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 12: [2022-11-25 19:33:16,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-25 19:33:16,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-25 19:33:16,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 36: [2022-11-25 19:33:16,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 39: [2022-11-25 19:33:16,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-25 19:33:16,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-25 19:33:16,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 51: [2022-11-25 19:33:16,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-25 19:33:16,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-25 19:33:16,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 13: [2022-11-25 19:33:16,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-25 19:33:16,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-25 19:33:16,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 19:33:16,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-25 19:33:16,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-25 19:33:16,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 19:33:16,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 36: [2022-11-25 19:33:16,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 20: [2022-11-25 19:33:16,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 36: [2022-11-25 19:33:16,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 19:33:16,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 53: [2022-11-25 19:33:16,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-25 19:33:16,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-25 19:33:16,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 59: [2022-11-25 19:33:16,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-25 19:33:16,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 52: [2022-11-25 19:33:16,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 59: [2022-11-25 19:33:16,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 52: [2022-11-25 19:33:16,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 34: [2022-11-25 19:33:16,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 52: [2022-11-25 19:33:16,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 34: [2022-11-25 19:33:16,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-25 19:33:16,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 45: [2022-11-25 19:33:16,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-25 19:33:16,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-25 19:33:16,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 44: [2022-11-25 19:33:16,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-25 19:33:16,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-25 19:33:16,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 19:33:16,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 2: [2022-11-25 19:33:16,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-25 19:33:16,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 26: [2022-11-25 19:33:16,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 26: [2022-11-25 19:33:16,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-25 19:33:16,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 19:33:16,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-25 19:33:16,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-25 19:33:16,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 50: [2022-11-25 19:33:16,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-25 19:33:16,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-25 19:33:16,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 33: [2022-11-25 19:33:16,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-25 19:33:16,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-25 19:33:16,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 38: [2022-11-25 19:33:16,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 38: [2022-11-25 19:33:16,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-25 19:33:16,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 60: [2022-11-25 19:33:16,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 60: [2022-11-25 19:33:16,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 60: [2022-11-25 19:33:16,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 62: [2022-11-25 19:33:16,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-25 19:33:16,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-25 19:33:16,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 35: [2022-11-25 19:33:16,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-25 19:33:16,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-25 19:33:16,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 17: [2022-11-25 19:33:16,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 17: [2022-11-25 19:33:16,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-25 19:33:16,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 10: [2022-11-25 19:33:16,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 10: [2022-11-25 19:33:16,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 10: [2022-11-25 19:33:16,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 22: [2022-11-25 19:33:16,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-25 19:33:16,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-25 19:33:16,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 8: [2022-11-25 19:33:16,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-25 19:33:16,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-25 19:33:16,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 43: [2022-11-25 19:33:16,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-25 19:33:16,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-25 19:33:16,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 19:33:16,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 3: [2022-11-25 19:33:16,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 24: [2022-11-25 19:33:16,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-25 19:33:16,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 3: [2022-11-25 19:33:16,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-25 19:33:16,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 19:33:16,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-25 19:33:16,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-25 19:33:16,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 58: [2022-11-25 19:33:16,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-25 19:33:16,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-25 19:33:16,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 31: [2022-11-25 19:33:16,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-25 19:33:16,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-25 19:33:16,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 19:33:16,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 4: [2022-11-25 19:33:16,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 18: [2022-11-25 19:33:16,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 4: [2022-11-25 19:33:16,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 19:33:16,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-25 19:33:16,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 14: [2022-11-25 19:33:16,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-25 19:33:16,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-25 19:33:16,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 19:33:16,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 5: [2022-11-25 19:33:16,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-25 19:33:16,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 55: [2022-11-25 19:33:16,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-25 19:33:16,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-25 19:33:16,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 30: [2022-11-25 19:33:16,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-25 19:33:16,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-25 19:33:16,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 16: [2022-11-25 19:33:16,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 27: [2022-11-25 19:33:16,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 16: [2022-11-25 19:33:16,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 27: [2022-11-25 19:33:16,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-25 19:33:16,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 16: [2022-11-25 19:33:16,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 48: [2022-11-25 19:33:16,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-25 19:33:16,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-25 19:33:16,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 25: [2022-11-25 19:33:16,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-25 19:33:16,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-25 19:33:16,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 19:33:16,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-25 19:33:16,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-25 19:33:16,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 19:33:16,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 21: [2022-11-25 19:33:16,925] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-25 19:33:16,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 37: [2022-11-25 19:33:16,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 37: [2022-11-25 19:33:16,925] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-25 19:33:16,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 32: [2022-11-25 19:33:16,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-25 19:33:16,926] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-25 19:33:16,926] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 19:33:16,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-25 19:33:16,927] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-25 19:33:16,927] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 28: [2022-11-25 19:33:16,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-25 19:33:16,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-25 19:33:16,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 56: [2022-11-25 19:33:16,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 56: [2022-11-25 19:33:16,929] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 56: [2022-11-25 19:33:16,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 13: [2022-11-25 19:33:16,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-25 19:33:16,929] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 23: [2022-11-25 19:33:16,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 13: [2022-11-25 19:33:16,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 23: [2022-11-25 19:33:16,929] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-25 19:33:16,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 26: [2022-11-25 19:33:16,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-25 19:33:16,929] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-25 19:33:16,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 46: [2022-11-25 19:33:16,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-25 19:33:16,931] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-25 19:33:16,931] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 57: [2022-11-25 19:33:16,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 57: [2022-11-25 19:33:16,931] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 57: [2022-11-25 19:33:16,931] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 40: [2022-11-25 19:33:16,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-25 19:33:16,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-25 19:33:16,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 39: [2022-11-25 19:33:16,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-25 19:33:16,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-25 19:33:16,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: [2022-11-25 19:33:16,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-25 19:33:16,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-25 19:33:16,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 49: [2022-11-25 19:33:16,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-25 19:33:16,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-25 19:33:16,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 12: [2022-11-25 19:33:16,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-25 19:33:16,935] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-25 19:33:16,935] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 19:33:16,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 54: [2022-11-25 19:33:16,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-25 19:33:16,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 19:33:16,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 6: [2022-11-25 19:33:16,937] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 6: [2022-11-25 19:33:16,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 19:33:16,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 51: [2022-11-25 19:33:16,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 20: [2022-11-25 19:33:16,937] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-25 19:33:16,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 51: [2022-11-25 19:33:16,937] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-25 19:33:16,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 19:33:16,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-25 19:33:16,938] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-25 19:33:16,938] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 44: [2022-11-25 19:33:16,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-25 19:33:16,938] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-25 19:33:16,938] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 19: [2022-11-25 19:33:16,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-25 19:33:16,938] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-25 19:33:16,938] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 34: [2022-11-25 19:33:16,939] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 34: [2022-11-25 19:33:16,939] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-25 19:33:16,939] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 53: [2022-11-25 19:33:16,939] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-25 19:33:16,939] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-25 19:33:16,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 36: [2022-11-25 19:33:16,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-25 19:33:16,941] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-25 19:33:16,941] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 59: [2022-11-25 19:33:16,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-25 19:33:16,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-25 19:33:16,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 19:33:16,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-25 19:33:16,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 7: [2022-11-25 19:33:16,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 2: [2022-11-25 19:33:16,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 19:33:16,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 42: [2022-11-25 19:33:16,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 7: [2022-11-25 19:33:16,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 42: [2022-11-25 19:33:16,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-25 19:33:16,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 38: [2022-11-25 19:33:16,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 60: [2022-11-25 19:33:16,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-25 19:33:16,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-25 19:33:16,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 35: [2022-11-25 19:33:16,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-25 19:33:16,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-25 19:33:16,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 43: [2022-11-25 19:33:16,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-25 19:33:16,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-25 19:33:16,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 19:33:16,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-25 19:33:16,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 38: [2022-11-25 19:33:16,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 41: [2022-11-25 19:33:16,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 38: [2022-11-25 19:33:16,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 17: [2022-11-25 19:33:16,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-25 19:33:16,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-25 19:33:16,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 25: [2022-11-25 19:33:16,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-25 19:33:16,948] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-25 19:33:16,948] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 19: [2022-11-25 19:33:16,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-25 19:33:16,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-25 19:33:16,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 29: [2022-11-25 19:33:16,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 29: [2022-11-25 19:33:16,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-25 19:33:16,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 3: [2022-11-25 19:33:16,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-25 19:33:16,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 3: [2022-11-25 19:33:16,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 50: [2022-11-25 19:33:16,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-25 19:33:16,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-25 19:33:16,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 19:33:16,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-25 19:33:16,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-25 19:33:16,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 19:33:16,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-25 19:33:16,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 62: [2022-11-25 19:33:16,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 63: [2022-11-25 19:33:16,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 62: [2022-11-25 19:33:16,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-25 19:33:16,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 11: [2022-11-25 19:33:16,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 11: [2022-11-25 19:33:16,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-25 19:33:16,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 33: [2022-11-25 19:33:16,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-25 19:33:16,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-25 19:33:16,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 8: [2022-11-25 19:33:16,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 10: [2022-11-25 19:33:16,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 8: [2022-11-25 19:33:16,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 10: [2022-11-25 19:33:16,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 8: [2022-11-25 19:33:16,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 10: [2022-11-25 19:33:16,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 40: [2022-11-25 19:33:16,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-25 19:33:16,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-25 19:33:16,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 27: [2022-11-25 19:33:16,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-25 19:33:16,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-25 19:33:16,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 19:33:16,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 58: [2022-11-25 19:33:16,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-25 19:33:16,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 24: [2022-11-25 19:33:16,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 58: [2022-11-25 19:33:16,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 19:33:16,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 48: [2022-11-25 19:33:16,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-25 19:33:16,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-25 19:33:16,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 19:33:16,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-25 19:33:16,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-25 19:33:16,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 19:33:16,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-25 19:33:16,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-25 19:33:16,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 31: [2022-11-25 19:33:16,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-25 19:33:16,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-25 19:33:16,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 37: [2022-11-25 19:33:16,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-25 19:33:16,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 19: [2022-11-25 19:33:16,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 37: [2022-11-25 19:33:16,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 19: [2022-11-25 19:33:16,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-25 19:33:16,970] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 19:33:16,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-25 19:33:16,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-25 19:33:16,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 19:33:16,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-25 19:33:16,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-25 19:33:16,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 30: [2022-11-25 19:33:16,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-25 19:33:16,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-25 19:33:16,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 19:33:16,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-25 19:33:16,976] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-25 19:33:16,976] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 19:33:16,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-25 19:33:16,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-25 19:33:16,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 56: [2022-11-25 19:33:16,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 56: [2022-11-25 19:33:16,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 56: [2022-11-25 19:33:16,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 19:33:16,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 41: [2022-11-25 19:33:16,978] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 41: [2022-11-25 19:33:16,978] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 19:33:16,978] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-25 19:33:16,978] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-25 19:33:16,978] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 33: [2022-11-25 19:33:16,978] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-25 19:33:16,978] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-25 19:33:16,978] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 3: [2022-11-25 19:33:16,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-25 19:33:16,979] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-25 19:33:16,979] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 34: [2022-11-25 19:33:16,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 34: [2022-11-25 19:33:16,979] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-25 19:33:16,979] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 28: [2022-11-25 19:33:16,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-25 19:33:16,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-25 19:33:16,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 50: [2022-11-25 19:33:16,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-25 19:33:16,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-25 19:33:16,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 44: [2022-11-25 19:33:16,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 53: [2022-11-25 19:33:16,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 44: [2022-11-25 19:33:16,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 53: [2022-11-25 19:33:16,981] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-25 19:33:16,981] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 44: [2022-11-25 19:33:16,981] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 19:33:16,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 58: [2022-11-25 19:33:16,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 58: [2022-11-25 19:33:16,981] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-25 19:33:16,981] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 19:33:16,981] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-25 19:33:16,981] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 42: [2022-11-25 19:33:16,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 42: [2022-11-25 19:33:16,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-25 19:33:16,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 19:33:16,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 7: [2022-11-25 19:33:16,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 7: [2022-11-25 19:33:16,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 14: [2022-11-25 19:33:16,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 14: [2022-11-25 19:33:16,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-25 19:33:16,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 62: [2022-11-25 19:33:16,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-25 19:33:16,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-25 19:33:16,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 59: [2022-11-25 19:33:16,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-25 19:33:16,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-25 19:33:16,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 55: [2022-11-25 19:33:16,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-25 19:33:16,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-25 19:33:16,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 8: [2022-11-25 19:33:16,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-25 19:33:16,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-25 19:33:16,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 35: [2022-11-25 19:33:16,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 35: [2022-11-25 19:33:16,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-25 19:33:16,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 43: [2022-11-25 19:33:16,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-25 19:33:16,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-25 19:33:16,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 46: [2022-11-25 19:33:16,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-25 19:33:16,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-25 19:33:16,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 23: [2022-11-25 19:33:16,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 25: [2022-11-25 19:33:16,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 23: [2022-11-25 19:33:16,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 23: [2022-11-25 19:33:16,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 25: [2022-11-25 19:33:16,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-25 19:33:16,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 17: [2022-11-25 19:33:16,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-25 19:33:16,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-25 19:33:16,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 19:33:16,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-25 19:33:16,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-25 19:33:16,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 51: [2022-11-25 19:33:17,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-25 19:33:17,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-25 19:33:17,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 10: [2022-11-25 19:33:17,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-25 19:33:17,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-25 19:33:17,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 30: [2022-11-25 19:33:17,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-25 19:33:17,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-25 19:33:17,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 31: [2022-11-25 19:33:17,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-25 19:33:17,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-25 19:33:17,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 19:33:17,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 60: [2022-11-25 19:33:17,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 4: [2022-11-25 19:33:17,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-25 19:33:17,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 60: [2022-11-25 19:33:17,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-25 19:33:17,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 19:33:17,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-25 19:33:17,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-25 19:33:17,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 19:33:17,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-25 19:33:17,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 47: [2022-11-25 19:33:17,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 19:33:17,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-25 19:33:17,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-25 19:33:17,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 19:33:17,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 27: [2022-11-25 19:33:17,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 27: [2022-11-25 19:33:17,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-25 19:33:17,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 19:33:17,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-25 19:33:17,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 19:33:17,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-25 19:33:17,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-25 19:33:17,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 19:33:17,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-25 19:33:17,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-25 19:33:17,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 55: [2022-11-25 19:33:17,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 55: [2022-11-25 19:33:17,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-25 19:33:17,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 19:33:17,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 61: [2022-11-25 19:33:17,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 1: [2022-11-25 19:33:17,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 61: [2022-11-25 19:33:17,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 1: [2022-11-25 19:33:17,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 19:33:17,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 19:33:17,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 20: [2022-11-25 19:33:17,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-25 19:33:17,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 43: [2022-11-25 19:33:17,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-25 19:33:17,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-25 19:33:17,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 19:33:17,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-25 19:33:17,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-25 19:33:17,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 38: [2022-11-25 19:33:17,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-25 19:33:17,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-25 19:33:17,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 31: [2022-11-25 19:33:17,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-25 19:33:17,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-25 19:33:17,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 50: [2022-11-25 19:33:17,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 40: [2022-11-25 19:33:17,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 50: [2022-11-25 19:33:17,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 40: [2022-11-25 19:33:17,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 50: [2022-11-25 19:33:17,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 40: [2022-11-25 19:33:17,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 19:33:17,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-25 19:33:17,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-25 19:33:17,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 56: [2022-11-25 19:33:17,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 63: [2022-11-25 19:33:17,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 56: [2022-11-25 19:33:17,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 63: [2022-11-25 19:33:17,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 56: [2022-11-25 19:33:17,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 19:33:17,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 63: [2022-11-25 19:33:17,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 19:33:17,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 58: [2022-11-25 19:33:17,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 3: [2022-11-25 19:33:17,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 41: [2022-11-25 19:33:17,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 58: [2022-11-25 19:33:17,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 41: [2022-11-25 19:33:17,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 58: [2022-11-25 19:33:17,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 3: [2022-11-25 19:33:17,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 41: [2022-11-25 19:33:17,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 3: [2022-11-25 19:33:17,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 19:33:17,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 14: [2022-11-25 19:33:17,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-25 19:33:17,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-25 19:33:17,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 46: [2022-11-25 19:33:17,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-25 19:33:17,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-25 19:33:17,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 19:33:17,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-25 19:33:17,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 28: [2022-11-25 19:33:17,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 61: [2022-11-25 19:33:17,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 27: [2022-11-25 19:33:17,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 28: [2022-11-25 19:33:17,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-25 19:33:17,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 27: [2022-11-25 19:33:17,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-25 19:33:17,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 19:33:17,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 18: [2022-11-25 19:33:17,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 33: [2022-11-25 19:33:17,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 6: [2022-11-25 19:33:17,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 62: [2022-11-25 19:33:17,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 33: [2022-11-25 19:33:17,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 6: [2022-11-25 19:33:17,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 62: [2022-11-25 19:33:17,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 18: [2022-11-25 19:33:17,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 33: [2022-11-25 19:33:17,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 62: [2022-11-25 19:33:17,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 19:33:17,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 53: [2022-11-25 19:33:17,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-25 19:33:17,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 7: [2022-11-25 19:33:17,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 34: [2022-11-25 19:33:17,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 42: [2022-11-25 19:33:17,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 53: [2022-11-25 19:33:17,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 19:33:17,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 7: [2022-11-25 19:33:17,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 5: [2022-11-25 19:33:17,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 7: [2022-11-25 19:33:17,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 34: [2022-11-25 19:33:17,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 42: [2022-11-25 19:33:17,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 5: [2022-11-25 19:33:17,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 34: [2022-11-25 19:33:17,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 42: [2022-11-25 19:33:17,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 10: [2022-11-25 19:33:17,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-25 19:33:17,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-25 19:33:17,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 40: [2022-11-25 19:33:17,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 40: [2022-11-25 19:33:17,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 4: [2022-11-25 19:33:17,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 40: [2022-11-25 19:33:17,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 19:33:17,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-25 19:33:17,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 19:33:17,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-25 19:33:17,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 38: [2022-11-25 19:33:17,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-25 19:33:17,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-25 19:33:17,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 19:33:17,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 19:33:17,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-25 19:33:17,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-25 19:33:17,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 17: [2022-11-25 19:33:17,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 48: [2022-11-25 19:33:17,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 17: [2022-11-25 19:33:17,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 48: [2022-11-25 19:33:17,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 17: [2022-11-25 19:33:17,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 48: [2022-11-25 19:33:17,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 59: [2022-11-25 19:33:17,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 59: [2022-11-25 19:33:17,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-25 19:33:17,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 60: [2022-11-25 19:33:17,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 60: [2022-11-25 19:33:17,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-25 19:33:17,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 19: [2022-11-25 19:33:17,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 19: [2022-11-25 19:33:17,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-25 19:33:17,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-25 19:33:17,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 19: [2022-11-25 19:33:17,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-25 19:33:17,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 44: [2022-11-25 19:33:17,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-25 19:33:17,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-25 19:33:17,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 37: [2022-11-25 19:33:17,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 37: [2022-11-25 19:33:17,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-25 19:33:17,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 28: [2022-11-25 19:33:17,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 28: [2022-11-25 19:33:17,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-25 19:33:17,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 35: [2022-11-25 19:33:17,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-25 19:33:17,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-25 19:33:17,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 23: [2022-11-25 19:33:17,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-25 19:33:17,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-25 19:33:17,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 8: [2022-11-25 19:33:17,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-25 19:33:17,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-25 19:33:17,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 55: [2022-11-25 19:33:17,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 55: [2022-11-25 19:33:17,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 55: [2022-11-25 19:33:17,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 19:33:17,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-25 19:33:17,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-25 19:33:17,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 14: [2022-11-25 19:33:17,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 14: [2022-11-25 19:33:17,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-25 19:33:17,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 25: [2022-11-25 19:33:17,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 25: [2022-11-25 19:33:17,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 25: [2022-11-25 19:33:17,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 30: [2022-11-25 19:33:17,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-25 19:33:17,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-25 19:33:17,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 19:33:17,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 21: [2022-11-25 19:33:17,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-25 19:33:17,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 51: [2022-11-25 19:33:17,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-25 19:33:17,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step4000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-25 19:33:17,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: successfully saved checkpoint at iteration 4000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5463.81 63: iteration 4010/ 24424 | consumed samples: 2053120 | consumed tokens: 4204789760 | elapsed time per iteration (s): 2.84 | learning rate: 1.894E-04 | global batch size: 512 | lm loss: 2.433550E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.072 | TFLOPs: 18.54 | 63: iteration 4020/ 24424 | consumed samples: 2058240 | consumed tokens: 4215275520 | elapsed time per iteration (s): 2.25 | learning rate: 1.894E-04 | global batch size: 512 | lm loss: 2.407669E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.001 | TFLOPs: 23.47 | 63: iteration 4030/ 24424 | consumed samples: 2063360 | consumed tokens: 4225761280 | elapsed time per iteration (s): 2.23 | learning rate: 1.893E-04 | global batch size: 512 | lm loss: 2.417696E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.856 | TFLOPs: 23.66 | 63: iteration 4040/ 24424 | consumed samples: 2068480 | consumed tokens: 4236247040 | elapsed time per iteration (s): 2.23 | learning rate: 1.893E-04 | global batch size: 512 | lm loss: 2.393891E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.609 | TFLOPs: 23.64 | 63: iteration 4050/ 24424 | consumed samples: 2073600 | consumed tokens: 4246732800 | elapsed time per iteration (s): 2.25 | learning rate: 1.892E-04 | global batch size: 512 | lm loss: 2.395481E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.502 | TFLOPs: 23.42 | 63: iteration 4060/ 24424 | consumed samples: 2078720 | consumed tokens: 4257218560 | elapsed time per iteration (s): 2.25 | learning rate: 1.892E-04 | global batch size: 512 | lm loss: 2.390566E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.723 | TFLOPs: 23.44 | 63: iteration 4070/ 24424 | consumed samples: 2083840 | consumed tokens: 4267704320 | elapsed time per iteration (s): 2.25 | learning rate: 1.891E-04 | global batch size: 512 | lm loss: 2.395267E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.477 | TFLOPs: 23.42 | 63: iteration 4080/ 24424 | consumed samples: 2088960 | consumed tokens: 4278190080 | elapsed time per iteration (s): 2.25 | learning rate: 1.891E-04 | global batch size: 512 | lm loss: 2.393430E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.132 | TFLOPs: 23.38 | 63: iteration 4090/ 24424 | consumed samples: 2094080 | consumed tokens: 4288675840 | elapsed time per iteration (s): 2.25 | learning rate: 1.890E-04 | global batch size: 512 | lm loss: 2.381086E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.622 | TFLOPs: 23.43 | 63: iteration 4100/ 24424 | consumed samples: 2099200 | consumed tokens: 4299161600 | elapsed time per iteration (s): 2.24 | learning rate: 1.889E-04 | global batch size: 512 | lm loss: 2.424920E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.655 | TFLOPs: 23.54 | 63: iteration 4110/ 24424 | consumed samples: 2104320 | consumed tokens: 4309647360 | elapsed time per iteration (s): 2.27 | learning rate: 1.889E-04 | global batch size: 512 | lm loss: 2.392635E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.183 | TFLOPs: 23.18 | 63: iteration 4120/ 24424 | consumed samples: 2109440 | consumed tokens: 4320133120 | elapsed time per iteration (s): 2.27 | learning rate: 1.888E-04 | global batch size: 512 | lm loss: 2.368168E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.147 | TFLOPs: 23.18 | 63: iteration 4130/ 24424 | consumed samples: 2114560 | consumed tokens: 4330618880 | elapsed time per iteration (s): 2.26 | learning rate: 1.888E-04 | global batch size: 512 | lm loss: 2.399220E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.796 | TFLOPs: 23.35 | 63: iteration 4140/ 24424 | consumed samples: 2119680 | consumed tokens: 4341104640 | elapsed time per iteration (s): 2.26 | learning rate: 1.887E-04 | global batch size: 512 | lm loss: 2.379981E+00 | grad norm: 0.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.540 | TFLOPs: 23.32 | 63: iteration 4150/ 24424 | consumed samples: 2124800 | consumed tokens: 4351590400 | elapsed time per iteration (s): 2.29 | learning rate: 1.887E-04 | global batch size: 512 | lm loss: 2.408694E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.282 | TFLOPs: 22.99 | 63: iteration 4160/ 24424 | consumed samples: 2129920 | consumed tokens: 4362076160 | elapsed time per iteration (s): 2.25 | learning rate: 1.886E-04 | global batch size: 512 | lm loss: 2.392412E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.568 | TFLOPs: 23.43 | 63: iteration 4170/ 24424 | consumed samples: 2135040 | consumed tokens: 4372561920 | elapsed time per iteration (s): 2.28 | learning rate: 1.885E-04 | global batch size: 512 | lm loss: 2.405133E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.084 | TFLOPs: 23.07 | 63: iteration 4180/ 24424 | consumed samples: 2140160 | consumed tokens: 4383047680 | elapsed time per iteration (s): 2.27 | learning rate: 1.885E-04 | global batch size: 512 | lm loss: 2.385162E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.378 | TFLOPs: 23.20 | 63: iteration 4190/ 24424 | consumed samples: 2145280 | consumed tokens: 4393533440 | elapsed time per iteration (s): 2.28 | learning rate: 1.884E-04 | global batch size: 512 | lm loss: 2.358143E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.415 | TFLOPs: 23.10 | 63: iteration 4200/ 24424 | consumed samples: 2150400 | consumed tokens: 4404019200 | elapsed time per iteration (s): 2.25 | learning rate: 1.884E-04 | global batch size: 512 | lm loss: 2.356046E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.320 | TFLOPs: 23.40 | 63: iteration 4210/ 24424 | consumed samples: 2155520 | consumed tokens: 4414504960 | elapsed time per iteration (s): 2.25 | learning rate: 1.883E-04 | global batch size: 512 | lm loss: 2.341805E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.084 | TFLOPs: 23.38 | 63: iteration 4220/ 24424 | consumed samples: 2160640 | consumed tokens: 4424990720 | elapsed time per iteration (s): 2.26 | learning rate: 1.883E-04 | global batch size: 512 | lm loss: 2.372881E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.004 | TFLOPs: 23.37 | 63: iteration 4230/ 24424 | consumed samples: 2165760 | consumed tokens: 4435476480 | elapsed time per iteration (s): 2.27 | learning rate: 1.882E-04 | global batch size: 512 | lm loss: 2.382625E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.938 | TFLOPs: 23.26 | 63: iteration 4240/ 24424 | consumed samples: 2170880 | consumed tokens: 4445962240 | elapsed time per iteration (s): 2.26 | learning rate: 1.881E-04 | global batch size: 512 | lm loss: 2.353213E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.568 | TFLOPs: 23.32 | 63: iteration 4250/ 24424 | consumed samples: 2176000 | consumed tokens: 4456448000 | elapsed time per iteration (s): 2.30 | learning rate: 1.881E-04 | global batch size: 512 | lm loss: 2.379504E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.444 | TFLOPs: 22.90 | 63: iteration 4260/ 24424 | consumed samples: 2181120 | consumed tokens: 4466933760 | elapsed time per iteration (s): 2.28 | learning rate: 1.880E-04 | global batch size: 512 | lm loss: 2.331095E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.712 | TFLOPs: 23.13 | 63: iteration 4270/ 24424 | consumed samples: 2186240 | consumed tokens: 4477419520 | elapsed time per iteration (s): 2.26 | learning rate: 1.880E-04 | global batch size: 512 | lm loss: 2.350667E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.182 | TFLOPs: 23.28 | 63: iteration 4280/ 24424 | consumed samples: 2191360 | consumed tokens: 4487905280 | elapsed time per iteration (s): 2.25 | learning rate: 1.879E-04 | global batch size: 512 | lm loss: 2.357227E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.745 | TFLOPs: 23.45 | 63: iteration 4290/ 24424 | consumed samples: 2196480 | consumed tokens: 4498391040 | elapsed time per iteration (s): 2.26 | learning rate: 1.879E-04 | global batch size: 512 | lm loss: 2.357798E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.868 | TFLOPs: 23.35 | 63: iteration 4300/ 24424 | consumed samples: 2201600 | consumed tokens: 4508876800 | elapsed time per iteration (s): 207.48 | learning rate: 1.878E-04 | global batch size: 512 | lm loss: 2.379171E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 2.468 | TFLOPs: 0.25 | 63: iteration 4310/ 24424 | consumed samples: 2206720 | consumed tokens: 4519362560 | elapsed time per iteration (s): 27.56 | learning rate: 1.877E-04 | global batch size: 512 | lm loss: 2.365800E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 18.575 | TFLOPs: 1.91 | 63: iteration 4320/ 24424 | consumed samples: 2211840 | consumed tokens: 4529848320 | elapsed time per iteration (s): 2.23 | learning rate: 1.877E-04 | global batch size: 512 | lm loss: 2.336031E+00 | grad norm: 0.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.475 | TFLOPs: 23.62 | 63: iteration 4330/ 24424 | consumed samples: 2216960 | consumed tokens: 4540334080 | elapsed time per iteration (s): 2.25 | learning rate: 1.876E-04 | global batch size: 512 | lm loss: 2.341244E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.668 | TFLOPs: 23.44 | 63: iteration 4340/ 24424 | consumed samples: 2222080 | consumed tokens: 4550819840 | elapsed time per iteration (s): 2.24 | learning rate: 1.876E-04 | global batch size: 512 | lm loss: 2.348012E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.917 | TFLOPs: 23.57 | 63: iteration 4350/ 24424 | consumed samples: 2227200 | consumed tokens: 4561305600 | elapsed time per iteration (s): 2.76 | learning rate: 1.875E-04 | global batch size: 512 | lm loss: 2.340361E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.653 | TFLOPs: 19.11 | 63: iteration 4360/ 24424 | consumed samples: 2232320 | consumed tokens: 4571791360 | elapsed time per iteration (s): 2.24 | learning rate: 1.874E-04 | global batch size: 512 | lm loss: 2.329967E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.495 | TFLOPs: 23.52 | 63: iteration 4370/ 24424 | consumed samples: 2237440 | consumed tokens: 4582277120 | elapsed time per iteration (s): 2.24 | learning rate: 1.874E-04 | global batch size: 512 | lm loss: 2.345143E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.353 | TFLOPs: 23.51 | 63: iteration 4380/ 24424 | consumed samples: 2242560 | consumed tokens: 4592762880 | elapsed time per iteration (s): 2.26 | learning rate: 1.873E-04 | global batch size: 512 | lm loss: 2.353806E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.692 | TFLOPs: 23.34 | 63: iteration 4390/ 24424 | consumed samples: 2247680 | consumed tokens: 4603248640 | elapsed time per iteration (s): 2.23 | learning rate: 1.873E-04 | global batch size: 512 | lm loss: 2.328944E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.220 | TFLOPs: 23.60 | 63: iteration 4400/ 24424 | consumed samples: 2252800 | consumed tokens: 4613734400 | elapsed time per iteration (s): 2.24 | learning rate: 1.872E-04 | global batch size: 512 | lm loss: 2.339562E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.225 | TFLOPs: 23.49 | 63: iteration 4410/ 24424 | consumed samples: 2257920 | consumed tokens: 4624220160 | elapsed time per iteration (s): 2.29 | learning rate: 1.871E-04 | global batch size: 512 | lm loss: 2.339224E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.666 | TFLOPs: 23.03 | 63: iteration 4420/ 24424 | consumed samples: 2263040 | consumed tokens: 4634705920 | elapsed time per iteration (s): 2.25 | learning rate: 1.871E-04 | global batch size: 512 | lm loss: 2.352766E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.911 | TFLOPs: 23.46 | 63: iteration 4430/ 24424 | consumed samples: 2268160 | consumed tokens: 4645191680 | elapsed time per iteration (s): 2.28 | learning rate: 1.870E-04 | global batch size: 512 | lm loss: 2.341785E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.193 | TFLOPs: 23.08 | 63: iteration 4440/ 24424 | consumed samples: 2273280 | consumed tokens: 4655677440 | elapsed time per iteration (s): 2.23 | learning rate: 1.870E-04 | global batch size: 512 | lm loss: 2.330803E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.569 | TFLOPs: 23.63 | 63: iteration 4450/ 24424 | consumed samples: 2278400 | consumed tokens: 4666163200 | elapsed time per iteration (s): 2.25 | learning rate: 1.869E-04 | global batch size: 512 | lm loss: 2.330899E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.424 | TFLOPs: 23.41 | 63: iteration 4460/ 24424 | consumed samples: 2283520 | consumed tokens: 4676648960 | elapsed time per iteration (s): 2.23 | learning rate: 1.868E-04 | global batch size: 512 | lm loss: 2.328222E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.477 | TFLOPs: 23.62 | 63: iteration 4470/ 24424 | consumed samples: 2288640 | consumed tokens: 4687134720 | elapsed time per iteration (s): 2.24 | learning rate: 1.868E-04 | global batch size: 512 | lm loss: 2.326114E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.094 | TFLOPs: 23.48 | 63: iteration 4480/ 24424 | consumed samples: 2293760 | consumed tokens: 4697620480 | elapsed time per iteration (s): 2.23 | learning rate: 1.867E-04 | global batch size: 512 | lm loss: 2.331432E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.272 | TFLOPs: 23.60 | 63: iteration 4490/ 24424 | consumed samples: 2298880 | consumed tokens: 4708106240 | elapsed time per iteration (s): 2.28 | learning rate: 1.867E-04 | global batch size: 512 | lm loss: 2.324170E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.433 | TFLOPs: 23.10 | 63: iteration 4500/ 24424 | consumed samples: 2304000 | consumed tokens: 4718592000 | elapsed time per iteration (s): 4.36 | learning rate: 1.866E-04 | global batch size: 512 | lm loss: 2.324346E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 117.340 | TFLOPs: 12.08 | 63: iteration 4510/ 24424 | consumed samples: 2309120 | consumed tokens: 4729077760 | elapsed time per iteration (s): 2.27 | learning rate: 1.865E-04 | global batch size: 512 | lm loss: 2.333504E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.635 | TFLOPs: 23.23 | 63: iteration 4520/ 24424 | consumed samples: 2314240 | consumed tokens: 4739563520 | elapsed time per iteration (s): 4.16 | learning rate: 1.865E-04 | global batch size: 512 | lm loss: 2.345370E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 122.997 | TFLOPs: 12.66 | 63: iteration 4530/ 24424 | consumed samples: 2319360 | consumed tokens: 4750049280 | elapsed time per iteration (s): 2.25 | learning rate: 1.864E-04 | global batch size: 512 | lm loss: 2.352847E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.993 | TFLOPs: 23.47 | 63: iteration 4540/ 24424 | consumed samples: 2324480 | consumed tokens: 4760535040 | elapsed time per iteration (s): 2.24 | learning rate: 1.863E-04 | global batch size: 512 | lm loss: 2.348190E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.795 | TFLOPs: 23.55 | 63: iteration 4550/ 24424 | consumed samples: 2329600 | consumed tokens: 4771020800 | elapsed time per iteration (s): 2.24 | learning rate: 1.863E-04 | global batch size: 512 | lm loss: 2.332277E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.184 | TFLOPs: 23.49 | 63: iteration 4560/ 24424 | consumed samples: 2334720 | consumed tokens: 4781506560 | elapsed time per iteration (s): 2.25 | learning rate: 1.862E-04 | global batch size: 512 | lm loss: 2.324496E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.087 | TFLOPs: 23.38 | 63: iteration 4570/ 24424 | consumed samples: 2339840 | consumed tokens: 4791992320 | elapsed time per iteration (s): 2.24 | learning rate: 1.862E-04 | global batch size: 512 | lm loss: 2.327666E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.227 | TFLOPs: 23.49 | 63: iteration 4580/ 24424 | consumed samples: 2344960 | consumed tokens: 4802478080 | elapsed time per iteration (s): 2.23 | learning rate: 1.861E-04 | global batch size: 512 | lm loss: 2.340377E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.425 | TFLOPs: 23.62 | 63: iteration 4590/ 24424 | consumed samples: 2350080 | consumed tokens: 4812963840 | elapsed time per iteration (s): 2.23 | learning rate: 1.860E-04 | global batch size: 512 | lm loss: 2.359194E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.895 | TFLOPs: 23.67 | 63: iteration 4600/ 24424 | consumed samples: 2355200 | consumed tokens: 4823449600 | elapsed time per iteration (s): 2.25 | learning rate: 1.860E-04 | global batch size: 512 | lm loss: 2.331818E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.969 | TFLOPs: 23.47 | 63: iteration 4610/ 24424 | consumed samples: 2360320 | consumed tokens: 4833935360 | elapsed time per iteration (s): 2.25 | learning rate: 1.859E-04 | global batch size: 512 | lm loss: 2.300184E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.873 | TFLOPs: 23.46 | 63: iteration 4620/ 24424 | consumed samples: 2365440 | consumed tokens: 4844421120 | elapsed time per iteration (s): 2.24 | learning rate: 1.858E-04 | global batch size: 512 | lm loss: 2.333238E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.790 | TFLOPs: 23.55 | 63: iteration 4630/ 24424 | consumed samples: 2370560 | consumed tokens: 4854906880 | elapsed time per iteration (s): 2.24 | learning rate: 1.858E-04 | global batch size: 512 | lm loss: 2.336270E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.388 | TFLOPs: 23.51 | 63: iteration 4640/ 24424 | consumed samples: 2375680 | consumed tokens: 4865392640 | elapsed time per iteration (s): 2.24 | learning rate: 1.857E-04 | global batch size: 512 | lm loss: 2.339492E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.409 | TFLOPs: 23.51 | 63: iteration 4650/ 24424 | consumed samples: 2380800 | consumed tokens: 4875878400 | elapsed time per iteration (s): 3.92 | learning rate: 1.857E-04 | global batch size: 512 | lm loss: 2.317494E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 130.685 | TFLOPs: 13.45 | 63: iteration 4660/ 24424 | consumed samples: 2385920 | consumed tokens: 4886364160 | elapsed time per iteration (s): 2.25 | learning rate: 1.856E-04 | global batch size: 512 | lm loss: 2.325285E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.749 | TFLOPs: 23.45 | 63: iteration 4670/ 24424 | consumed samples: 2391040 | consumed tokens: 4896849920 | elapsed time per iteration (s): 3.50 | learning rate: 1.855E-04 | global batch size: 512 | lm loss: 2.344816E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 146.371 | TFLOPs: 15.07 | 63: iteration 4680/ 24424 | consumed samples: 2396160 | consumed tokens: 4907335680 | elapsed time per iteration (s): 2.26 | learning rate: 1.855E-04 | global batch size: 512 | lm loss: 2.315987E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.152 | TFLOPs: 23.28 | 63: iteration 4690/ 24424 | consumed samples: 2401280 | consumed tokens: 4917821440 | elapsed time per iteration (s): 2.23 | learning rate: 1.854E-04 | global batch size: 512 | lm loss: 2.325190E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.196 | TFLOPs: 23.59 | 63: iteration 4700/ 24424 | consumed samples: 2406400 | consumed tokens: 4928307200 | elapsed time per iteration (s): 2.26 | learning rate: 1.853E-04 | global batch size: 512 | lm loss: 2.314659E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.229 | TFLOPs: 23.29 | 63: iteration 4710/ 24424 | consumed samples: 2411520 | consumed tokens: 4938792960 | elapsed time per iteration (s): 2.28 | learning rate: 1.853E-04 | global batch size: 512 | lm loss: 2.312363E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.555 | TFLOPs: 23.12 | 63: iteration 4720/ 24424 | consumed samples: 2416640 | consumed tokens: 4949278720 | elapsed time per iteration (s): 2.27 | learning rate: 1.852E-04 | global batch size: 512 | lm loss: 2.320283E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.282 | TFLOPs: 23.19 | 63: iteration 4730/ 24424 | consumed samples: 2421760 | consumed tokens: 4959764480 | elapsed time per iteration (s): 2.24 | learning rate: 1.851E-04 | global batch size: 512 | lm loss: 2.350091E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.987 | TFLOPs: 23.57 | 63: iteration 4740/ 24424 | consumed samples: 2426880 | consumed tokens: 4970250240 | elapsed time per iteration (s): 2.24 | learning rate: 1.851E-04 | global batch size: 512 | lm loss: 2.316240E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.267 | TFLOPs: 23.50 | 63: iteration 4750/ 24424 | consumed samples: 2432000 | consumed tokens: 4980736000 | elapsed time per iteration (s): 2.23 | learning rate: 1.850E-04 | global batch size: 512 | lm loss: 2.310583E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.722 | TFLOPs: 23.65 | 63: iteration 4760/ 24424 | consumed samples: 2437120 | consumed tokens: 4991221760 | elapsed time per iteration (s): 2.23 | learning rate: 1.849E-04 | global batch size: 512 | lm loss: 2.311405E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.371 | TFLOPs: 23.61 | 63: iteration 4770/ 24424 | consumed samples: 2442240 | consumed tokens: 5001707520 | elapsed time per iteration (s): 2.26 | learning rate: 1.849E-04 | global batch size: 512 | lm loss: 2.338523E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.097 | TFLOPs: 23.28 | 63: iteration 4780/ 24424 | consumed samples: 2447360 | consumed tokens: 5012193280 | elapsed time per iteration (s): 2.25 | learning rate: 1.848E-04 | global batch size: 512 | lm loss: 2.327018E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.981 | TFLOPs: 23.47 | 63: iteration 4790/ 24424 | consumed samples: 2452480 | consumed tokens: 5022679040 | elapsed time per iteration (s): 2.24 | learning rate: 1.848E-04 | global batch size: 512 | lm loss: 2.291772E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.014 | TFLOPs: 23.58 | 63: iteration 4800/ 24424 | consumed samples: 2457600 | consumed tokens: 5033164800 | elapsed time per iteration (s): 2.25 | learning rate: 1.847E-04 | global batch size: 512 | lm loss: 2.297870E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.120 | TFLOPs: 23.38 | 63: iteration 4810/ 24424 | consumed samples: 2462720 | consumed tokens: 5043650560 | elapsed time per iteration (s): 2.24 | learning rate: 1.846E-04 | global batch size: 512 | lm loss: 2.327029E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.766 | TFLOPs: 23.55 | 63: iteration 4820/ 24424 | consumed samples: 2467840 | consumed tokens: 5054136320 | elapsed time per iteration (s): 4.00 | learning rate: 1.846E-04 | global batch size: 512 | lm loss: 2.307931E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 127.867 | TFLOPs: 13.16 | 63: iteration 4830/ 24424 | consumed samples: 2472960 | consumed tokens: 5064622080 | elapsed time per iteration (s): 2.24 | learning rate: 1.845E-04 | global batch size: 512 | lm loss: 2.324565E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.325 | TFLOPs: 23.51 | 63: iteration 4840/ 24424 | consumed samples: 2478080 | consumed tokens: 5075107840 | elapsed time per iteration (s): 2.26 | learning rate: 1.844E-04 | global batch size: 512 | lm loss: 2.297696E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.452 | TFLOPs: 23.31 | 63: iteration 4850/ 24424 | consumed samples: 2483200 | consumed tokens: 5085593600 | elapsed time per iteration (s): 2.28 | learning rate: 1.844E-04 | global batch size: 512 | lm loss: 2.305356E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.343 | TFLOPs: 23.10 | 63: iteration 4860/ 24424 | consumed samples: 2488320 | consumed tokens: 5096079360 | elapsed time per iteration (s): 2.28 | learning rate: 1.843E-04 | global batch size: 512 | lm loss: 2.306402E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.023 | TFLOPs: 23.17 | 63: iteration 4870/ 24424 | consumed samples: 2493440 | consumed tokens: 5106565120 | elapsed time per iteration (s): 2.24 | learning rate: 1.842E-04 | global batch size: 512 | lm loss: 2.340121E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.676 | TFLOPs: 23.54 | 63: iteration 4880/ 24424 | consumed samples: 2498560 | consumed tokens: 5117050880 | elapsed time per iteration (s): 2.23 | learning rate: 1.842E-04 | global batch size: 512 | lm loss: 2.299407E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.315 | TFLOPs: 23.61 | 63: iteration 4890/ 24424 | consumed samples: 2503680 | consumed tokens: 5127536640 | elapsed time per iteration (s): 2.23 | learning rate: 1.841E-04 | global batch size: 512 | lm loss: 2.323074E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.672 | TFLOPs: 23.64 | 63: iteration 4900/ 24424 | consumed samples: 2508800 | consumed tokens: 5138022400 | elapsed time per iteration (s): 2.25 | learning rate: 1.840E-04 | global batch size: 512 | lm loss: 2.308080E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.649 | TFLOPs: 23.44 | 63: iteration 4910/ 24424 | consumed samples: 2513920 | consumed tokens: 5148508160 | elapsed time per iteration (s): 2.24 | learning rate: 1.840E-04 | global batch size: 512 | lm loss: 2.303841E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.149 | TFLOPs: 23.49 | 63: iteration 4920/ 24424 | consumed samples: 2519040 | consumed tokens: 5158993920 | elapsed time per iteration (s): 2.24 | learning rate: 1.839E-04 | global batch size: 512 | lm loss: 2.294131E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.070 | TFLOPs: 23.48 | 63: iteration 4930/ 24424 | consumed samples: 2524160 | consumed tokens: 5169479680 | elapsed time per iteration (s): 2.23 | learning rate: 1.838E-04 | global batch size: 512 | lm loss: 2.320778E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.115 | TFLOPs: 23.59 | 63: iteration 4940/ 24424 | consumed samples: 2529280 | consumed tokens: 5179965440 | elapsed time per iteration (s): 2.25 | learning rate: 1.838E-04 | global batch size: 512 | lm loss: 2.288071E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.736 | TFLOPs: 23.44 | 63: iteration 4950/ 24424 | consumed samples: 2534400 | consumed tokens: 5190451200 | elapsed time per iteration (s): 2.25 | learning rate: 1.837E-04 | global batch size: 512 | lm loss: 2.333799E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.025 | TFLOPs: 23.47 | 63: iteration 4960/ 24424 | consumed samples: 2539520 | consumed tokens: 5200936960 | elapsed time per iteration (s): 2.30 | learning rate: 1.836E-04 | global batch size: 512 | lm loss: 2.329121E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.434 | TFLOPs: 22.90 | 63: iteration 4970/ 24424 | consumed samples: 2544640 | consumed tokens: 5211422720 | elapsed time per iteration (s): 2.23 | learning rate: 1.836E-04 | global batch size: 512 | lm loss: 2.300925E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.423 | TFLOPs: 23.62 | 63: iteration 4980/ 24424 | consumed samples: 2549760 | consumed tokens: 5221908480 | elapsed time per iteration (s): 2.27 | learning rate: 1.835E-04 | global batch size: 512 | lm loss: 2.300546E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.445 | TFLOPs: 23.21 | 63: iteration 4990/ 24424 | consumed samples: 2554880 | consumed tokens: 5232394240 | elapsed time per iteration (s): 2.25 | learning rate: 1.834E-04 | global batch size: 512 | lm loss: 2.298577E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.721 | TFLOPs: 23.44 | 63: iteration 5000/ 24424 | consumed samples: 2560000 | consumed tokens: 5242880000 | elapsed time per iteration (s): 2.25 | learning rate: 1.834E-04 | global batch size: 512 | lm loss: 2.325463E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.010 | TFLOPs: 23.47 | 63: ------------------------------------------------------------------------------------------ 63: valid loss at iteration 5000 | lm loss value: 2.224554E+00 | lm loss PPL: 9.249353E+00 | 63: ------------------------------------------------------------------------------------------ 0: saving checkpoint at iteration 5000 to checkpoints_3b9 0: [2022-11-25 20:50:46,936] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5000 is begin to save! 0: [2022-11-25 20:50:46,956] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_01-model_00-model_states.pt... 32: [2022-11-25 20:50:46,957] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_21-model_00-model_states.pt... 32: [2022-11-25 20:50:47,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_21-model_00-model_states.pt. 32: [2022-11-25 20:50:47,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_22-model_00-model_states.pt... 0: [2022-11-25 20:50:47,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_01-model_00-model_states.pt. 0: [2022-11-25 20:50:47,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_03-model_00-model_states.pt... 32: [2022-11-25 20:50:47,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_22-model_00-model_states.pt. 32: [2022-11-25 20:50:47,480] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_23-model_00-model_states.pt... 0: [2022-11-25 20:50:47,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_03-model_00-model_states.pt. 0: [2022-11-25 20:50:47,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_04-model_00-model_states.pt... 32: [2022-11-25 20:50:47,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_23-model_00-model_states.pt. 32: [2022-11-25 20:50:47,718] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_24-model_00-model_states.pt... 0: [2022-11-25 20:50:47,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_04-model_00-model_states.pt. 0: [2022-11-25 20:50:47,837] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_05-model_00-model_states.pt... 32: [2022-11-25 20:50:47,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_24-model_00-model_states.pt. 32: [2022-11-25 20:50:47,953] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_25-model_00-model_states.pt... 0: [2022-11-25 20:50:48,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_05-model_00-model_states.pt. 0: [2022-11-25 20:50:48,079] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_06-model_00-model_states.pt... 32: [2022-11-25 20:50:48,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_25-model_00-model_states.pt. 32: [2022-11-25 20:50:48,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_26-model_00-model_states.pt... 0: [2022-11-25 20:50:48,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_06-model_00-model_states.pt. 0: [2022-11-25 20:50:48,304] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_07-model_00-model_states.pt... 32: [2022-11-25 20:50:48,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_26-model_00-model_states.pt. 32: [2022-11-25 20:50:48,451] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_27-model_00-model_states.pt... 0: [2022-11-25 20:50:48,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_07-model_00-model_states.pt. 0: [2022-11-25 20:50:48,532] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_08-model_00-model_states.pt... 32: [2022-11-25 20:50:48,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_27-model_00-model_states.pt. 32: [2022-11-25 20:50:48,685] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_28-model_00-model_states.pt... 0: [2022-11-25 20:50:48,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_08-model_00-model_states.pt. 0: [2022-11-25 20:50:48,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_09-model_00-model_states.pt... 32: [2022-11-25 20:50:48,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_28-model_00-model_states.pt. 32: [2022-11-25 20:50:48,922] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_29-model_00-model_states.pt... 0: [2022-11-25 20:50:48,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_09-model_00-model_states.pt. 0: [2022-11-25 20:50:48,990] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_10-model_00-model_states.pt... 32: [2022-11-25 20:50:49,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_29-model_00-model_states.pt. 32: [2022-11-25 20:50:49,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_30-model_00-model_states.pt... 0: [2022-11-25 20:50:49,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_10-model_00-model_states.pt. 0: [2022-11-25 20:50:49,225] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_11-model_00-model_states.pt... 32: [2022-11-25 20:50:49,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_30-model_00-model_states.pt. 32: [2022-11-25 20:50:49,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_31-model_00-model_states.pt... 0: [2022-11-25 20:50:49,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_11-model_00-model_states.pt. 0: [2022-11-25 20:50:49,449] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_12-model_00-model_states.pt... 32: [2022-11-25 20:50:49,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_31-model_00-model_states.pt. 32: [2022-11-25 20:50:49,607] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_32-model_00-model_states.pt... 0: [2022-11-25 20:50:49,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_12-model_00-model_states.pt. 0: [2022-11-25 20:50:49,677] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_13-model_00-model_states.pt... 32: [2022-11-25 20:50:49,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_32-model_00-model_states.pt. 32: [2022-11-25 20:50:49,838] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_33-model_00-model_states.pt... 0: [2022-11-25 20:50:49,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_13-model_00-model_states.pt. 0: [2022-11-25 20:50:49,896] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_14-model_00-model_states.pt... 32: [2022-11-25 20:50:50,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_33-model_00-model_states.pt. 32: [2022-11-25 20:50:50,065] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_34-model_00-model_states.pt... 0: [2022-11-25 20:50:50,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_14-model_00-model_states.pt. 0: [2022-11-25 20:50:50,113] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_15-model_00-model_states.pt... 32: [2022-11-25 20:50:50,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_34-model_00-model_states.pt. 32: [2022-11-25 20:50:50,293] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_35-model_00-model_states.pt... 0: [2022-11-25 20:50:50,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_15-model_00-model_states.pt. 0: [2022-11-25 20:50:50,332] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_16-model_00-model_states.pt... 32: [2022-11-25 20:50:50,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_35-model_00-model_states.pt. 32: [2022-11-25 20:50:50,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_36-model_00-model_states.pt... 0: [2022-11-25 20:50:50,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_16-model_00-model_states.pt. 0: [2022-11-25 20:50:50,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_17-model_00-model_states.pt... 32: [2022-11-25 20:50:50,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_36-model_00-model_states.pt. 32: [2022-11-25 20:50:50,748] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_37-model_00-model_states.pt... 0: [2022-11-25 20:50:50,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_17-model_00-model_states.pt. 0: [2022-11-25 20:50:50,766] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_18-model_00-model_states.pt... 32: [2022-11-25 20:50:50,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_37-model_00-model_states.pt. 32: [2022-11-25 20:50:50,966] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_38-model_00-model_states.pt... 0: [2022-11-25 20:50:50,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_18-model_00-model_states.pt. 0: [2022-11-25 20:50:50,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_19-model_00-model_states.pt... 32: [2022-11-25 20:50:51,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_38-model_00-model_states.pt. 32: [2022-11-25 20:50:51,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_40-model_00-model_states.pt... 0: [2022-11-25 20:50:51,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_19-model_00-model_states.pt. 0: [2022-11-25 20:50:51,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/layer_20-model_00-model_states.pt... 32: [2022-11-25 20:50:51,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_40-model_00-model_states.pt. 32: [2022-11-25 20:50:51,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/mp_rank_01_model_states.pt... 32: [2022-11-25 20:50:51,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/mp_rank_01_model_states.pt. 0: [2022-11-25 20:50:51,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/layer_20-model_00-model_states.pt. 0: [2022-11-25 20:50:51,416] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step5000/mp_rank_00_model_states.pt 0: [2022-11-25 20:50:51,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/mp_rank_00_model_states.pt... 0: [2022-11-25 20:50:51,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/mp_rank_00_model_states.pt. 60: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 60: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 32: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 50: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 50: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 51: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 51: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 55: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 55: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 53: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 57: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 57: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 61: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 61: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 59: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 59: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 59: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 63: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 63: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 52: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 54: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 54: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 54: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 56: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 56: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 58: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 58: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 58: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 62: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 62: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 33: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 33: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 33: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 35: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 35: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 34: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 40: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 40: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 40: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 44: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 44: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 44: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 44: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 32: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 32: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 32: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 32: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 42: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 42: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 42: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 46: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 46: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 46: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 36: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 36: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 36: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 48: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 48: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 37: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 37: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 37: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 37: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 47: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 47: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 49: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 49: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 45: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 45: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 45: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 39: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 39: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 50: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 38: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 38: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 51: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 55: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 55: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 53: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 53: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 57: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 57: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 61: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 59: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 63: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 19: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 19: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 19: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 19: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 21: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 15: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 15: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 15: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 15: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 52: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 54: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 54: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 56: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 56: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 58: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 62: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 16: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 60: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 60: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 60: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 2: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 2: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 4: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 14: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 14: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 18: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 18: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 18: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 18: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 18: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 18: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 10: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 10: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 10: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 8: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 0: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 0: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 0: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 26: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 11: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 17: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 9: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 9: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 13: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 13: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 23: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 23: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 23: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 23: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 25: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 25: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 27: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 27: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 27: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 29: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 31: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 31: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 31: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 31: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 5: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 5: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 33: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 33: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 1: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 1: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 7: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 7: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 7: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 7: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 3: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 3: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 35: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 22: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 22: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 22: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 22: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 22: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 12: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 12: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 12: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 12: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 12: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 30: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 30: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 30: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 24: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 28: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 28: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 20: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 20: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 20: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 34: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 40: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 44: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 6: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 6: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 32: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 42: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 46: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 46: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 36: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 48: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 48: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 41: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 37: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 37: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 47: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 49: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 43: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 43: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 43: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 45: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 39: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 50: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 38: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 38: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 53: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 61: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 63: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 19: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 19: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 21: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 21: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 52: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 58: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 62: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 16: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 16: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 60: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 2: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 2: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 2: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 4: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 14: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 14: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 14: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 14: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 18: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 10: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 10: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 8: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 0: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 0: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 26: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 26: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 11: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 11: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 11: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 11: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 17: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 9: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 13: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 13: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 13: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 13: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 13: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 23: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 25: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 25: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 27: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 27: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 29: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 31: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 31: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 31: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 5: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 1: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 1: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 1: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 1: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 1: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 7: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 7: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 3: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 35: [2022-11-25 20:50:51,604] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 22: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 22: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 12: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 30: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 30: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 30: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 30: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 30: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 24: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 24: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 24: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 24: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 24: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 28: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 28: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 28: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 20: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 34: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 40: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 44: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 6: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 6: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 6: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 6: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 6: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 32: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 42: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 36: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 41: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 47: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 43: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 45: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 39: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 19: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 21: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 15: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 15: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 52: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 16: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 2: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 2: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 4: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 4: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 14: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 18: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 10: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 26: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 26: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 11: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 17: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 9: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 23: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 25: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 25: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 27: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 29: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 31: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 5: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 5: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 1: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 7: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 3: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 3: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 22: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 12: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 12: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 24: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 24: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 28: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 28: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 20: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 20: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 6: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 32: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 42: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 41: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 41: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 39: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 39: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 19: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 21: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 15: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 52: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 16: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 2: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 4: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 14: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 10: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 8: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 8: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 0: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 11: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 17: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 9: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 23: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 27: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 29: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 29: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 29: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 5: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 3: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 20: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 41: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 21: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 52: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 16: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 4: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 10: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 8: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 8: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 26: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 11: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 17: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 9: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 23: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 27: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 29: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 3: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 21: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 52: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 16: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 4: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 4: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 8: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 17: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 3: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 20: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 21: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 16: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 17: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 17: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 9: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 29: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 9: [2022-11-25 20:50:51,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step5000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 49: [2022-11-25 20:50:51,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-25 20:50:51,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-25 20:50:51,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 25: [2022-11-25 20:50:51,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 25: [2022-11-25 20:50:51,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-25 20:50:51,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 53: [2022-11-25 20:50:51,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-25 20:50:51,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-25 20:50:51,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 14: [2022-11-25 20:50:51,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 14: [2022-11-25 20:50:51,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-25 20:50:51,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 14: [2022-11-25 20:50:51,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-25 20:50:51,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 22: [2022-11-25 20:50:51,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 14: [2022-11-25 20:50:51,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 22: [2022-11-25 20:50:51,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-25 20:50:51,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 5: [2022-11-25 20:50:51,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-25 20:50:51,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 5: [2022-11-25 20:50:51,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 5: [2022-11-25 20:50:51,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-25 20:50:51,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 5: [2022-11-25 20:50:51,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-25 20:50:51,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-25 20:50:51,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 5: [2022-11-25 20:50:51,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 40: [2022-11-25 20:50:51,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-25 20:50:51,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-25 20:50:51,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 22: [2022-11-25 20:50:51,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-25 20:50:51,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-25 20:50:51,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 61: [2022-11-25 20:50:51,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-25 20:50:51,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-25 20:50:51,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 20:50:51,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 0: [2022-11-25 20:50:51,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 0: [2022-11-25 20:50:51,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 25: [2022-11-25 20:50:51,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 0: [2022-11-25 20:50:51,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 0: [2022-11-25 20:50:51,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 25: [2022-11-25 20:50:51,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 0: [2022-11-25 20:50:51,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 20:50:51,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 25: [2022-11-25 20:50:51,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 49: [2022-11-25 20:50:51,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-25 20:50:51,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-25 20:50:51,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 25: [2022-11-25 20:50:51,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 25: [2022-11-25 20:50:51,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-25 20:50:51,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 20:50:51,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 34: [2022-11-25 20:50:51,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-25 20:50:51,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 44: [2022-11-25 20:50:51,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-25 20:50:51,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-25 20:50:51,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-25 20:50:51,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-25 20:50:51,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 44: [2022-11-25 20:50:51,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 5: [2022-11-25 20:50:51,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 5: [2022-11-25 20:50:51,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 20: [2022-11-25 20:50:51,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 5: [2022-11-25 20:50:51,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 20: [2022-11-25 20:50:51,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-25 20:50:51,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 40: [2022-11-25 20:50:51,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-25 20:50:51,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-25 20:50:51,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 20:50:51,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-25 20:50:51,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-25 20:50:51,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 20:50:51,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-25 20:50:51,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-25 20:50:51,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 20:50:51,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-25 20:50:51,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-25 20:50:51,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 20:50:51,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-25 20:50:51,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-25 20:50:51,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 20:50:51,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 29: [2022-11-25 20:50:51,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-25 20:50:51,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 55: [2022-11-25 20:50:51,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 55: [2022-11-25 20:50:51,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 29: [2022-11-25 20:50:51,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-25 20:50:51,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-25 20:50:51,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 29: [2022-11-25 20:50:51,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 20:50:51,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 7: [2022-11-25 20:50:51,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 0: [2022-11-25 20:50:51,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 7: [2022-11-25 20:50:51,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 0: [2022-11-25 20:50:51,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 7: [2022-11-25 20:50:51,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 49: [2022-11-25 20:50:51,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 49: [2022-11-25 20:50:51,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-25 20:50:51,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 22: [2022-11-25 20:50:51,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 22: [2022-11-25 20:50:51,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 22: [2022-11-25 20:50:51,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 16: [2022-11-25 20:50:51,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-25 20:50:51,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 16: [2022-11-25 20:50:51,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 49: [2022-11-25 20:50:51,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-25 20:50:51,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 49: [2022-11-25 20:50:51,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 20:50:51,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 58: [2022-11-25 20:50:51,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 58: [2022-11-25 20:50:51,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 9: [2022-11-25 20:50:51,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 58: [2022-11-25 20:50:51,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 20:50:51,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 58: [2022-11-25 20:50:51,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-25 20:50:51,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-25 20:50:51,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 5: [2022-11-25 20:50:51,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 19: [2022-11-25 20:50:51,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 5: [2022-11-25 20:50:51,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 19: [2022-11-25 20:50:51,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 5: [2022-11-25 20:50:51,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 19: [2022-11-25 20:50:51,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 41: [2022-11-25 20:50:51,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-25 20:50:51,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-25 20:50:51,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-25 20:50:51,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 21: [2022-11-25 20:50:51,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-25 20:50:51,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 41: [2022-11-25 20:50:51,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 20:50:51,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-25 20:50:51,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 41: [2022-11-25 20:50:51,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 20:50:51,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 20:50:51,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 20:50:51,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 55: [2022-11-25 20:50:51,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-25 20:50:51,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 20:50:51,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-25 20:50:51,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-25 20:50:51,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 25: [2022-11-25 20:50:51,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 25: [2022-11-25 20:50:51,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 44: [2022-11-25 20:50:51,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 25: [2022-11-25 20:50:51,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 44: [2022-11-25 20:50:51,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 7: [2022-11-25 20:50:51,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 44: [2022-11-25 20:50:51,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 7: [2022-11-25 20:50:51,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-25 20:50:51,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 20:50:51,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-25 20:50:51,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-25 20:50:51,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 19: [2022-11-25 20:50:51,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 30: [2022-11-25 20:50:51,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 19: [2022-11-25 20:50:51,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-25 20:50:51,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 20:50:51,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 30: [2022-11-25 20:50:51,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 52: [2022-11-25 20:50:51,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 30: [2022-11-25 20:50:51,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 20:50:51,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 30: [2022-11-25 20:50:51,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 52: [2022-11-25 20:50:51,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 20:50:51,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 62: [2022-11-25 20:50:51,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 40: [2022-11-25 20:50:51,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 62: [2022-11-25 20:50:51,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 40: [2022-11-25 20:50:51,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 62: [2022-11-25 20:50:51,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 40: [2022-11-25 20:50:51,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 48: [2022-11-25 20:50:51,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-25 20:50:51,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-25 20:50:51,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-25 20:50:51,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-25 20:50:51,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 48: [2022-11-25 20:50:51,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 20:50:51,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-25 20:50:51,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-25 20:50:51,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 20: [2022-11-25 20:50:51,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-25 20:50:51,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-25 20:50:51,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 51: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 51: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 8: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 8: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 24: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 11: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 11: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 47: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 51: [2022-11-25 20:50:51,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-25 20:50:51,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 8: [2022-11-25 20:50:51,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-25 20:50:51,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 24: [2022-11-25 20:50:51,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-25 20:50:51,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 47: [2022-11-25 20:50:51,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-25 20:50:51,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 51: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 51: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 8: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 8: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 47: [2022-11-25 20:50:51,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 24: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 24: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 47: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 47: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 11: [2022-11-25 20:50:51,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 11: [2022-11-25 20:50:51,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 11: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 11: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 47: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 35: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 52: [2022-11-25 20:50:51,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 35: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 22: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 52: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 35: [2022-11-25 20:50:51,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-25 20:50:51,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 22: [2022-11-25 20:50:51,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 42: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 42: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 42: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 35: [2022-11-25 20:50:51,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 22: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 35: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 20:50:51,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 42: [2022-11-25 20:50:51,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-25 20:50:51,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-25 20:50:51,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 32: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 20:50:51,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-25 20:50:51,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-25 20:50:51,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 60: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-25 20:50:51,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 60: [2022-11-25 20:50:51,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-25 20:50:51,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 60: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 60: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 36: [2022-11-25 20:50:51,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-25 20:50:51,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 46: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-25 20:50:51,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-25 20:50:51,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-25 20:50:51,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 46: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 3: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 46: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 20:50:51,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 3: [2022-11-25 20:50:51,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 3: [2022-11-25 20:50:51,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 20:50:51,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 39: [2022-11-25 20:50:51,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 39: [2022-11-25 20:50:51,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-25 20:50:51,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-25 20:50:51,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-25 20:50:51,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 39: [2022-11-25 20:50:51,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-25 20:50:51,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-25 20:50:51,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 39: [2022-11-25 20:50:51,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 2: [2022-11-25 20:50:51,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-25 20:50:51,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-25 20:50:51,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 2: [2022-11-25 20:50:51,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-25 20:50:51,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-25 20:50:51,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 35: [2022-11-25 20:50:51,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 2: [2022-11-25 20:50:51,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 2: [2022-11-25 20:50:51,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 2: [2022-11-25 20:50:51,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 35: [2022-11-25 20:50:51,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-25 20:50:51,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 19: [2022-11-25 20:50:51,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 19: [2022-11-25 20:50:51,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-25 20:50:51,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 18: [2022-11-25 20:50:51,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-25 20:50:51,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-25 20:50:51,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-25 20:50:51,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-25 20:50:51,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-25 20:50:51,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-25 20:50:51,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 18: [2022-11-25 20:50:51,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 18: [2022-11-25 20:50:51,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 15: [2022-11-25 20:50:51,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-25 20:50:51,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-25 20:50:51,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-25 20:50:51,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 15: [2022-11-25 20:50:51,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 57: [2022-11-25 20:50:51,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 57: [2022-11-25 20:50:51,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 15: [2022-11-25 20:50:51,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 57: [2022-11-25 20:50:51,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 57: [2022-11-25 20:50:51,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 57: [2022-11-25 20:50:51,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 57: [2022-11-25 20:50:51,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 14: [2022-11-25 20:50:51,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-25 20:50:51,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-25 20:50:51,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 20:50:51,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-25 20:50:51,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-25 20:50:51,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 20:50:51,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 13: [2022-11-25 20:50:51,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 13: [2022-11-25 20:50:51,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-25 20:50:51,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-25 20:50:51,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-25 20:50:51,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-25 20:50:51,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 20:50:51,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 20:50:51,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 14: [2022-11-25 20:50:51,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 14: [2022-11-25 20:50:51,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-25 20:50:51,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 53: [2022-11-25 20:50:51,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-25 20:50:51,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-25 20:50:51,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 16: [2022-11-25 20:50:51,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-25 20:50:51,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-25 20:50:51,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 20:50:51,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 28: [2022-11-25 20:50:51,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-25 20:50:51,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-25 20:50:51,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 28: [2022-11-25 20:50:51,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-25 20:50:51,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-25 20:50:51,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-25 20:50:51,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 28: [2022-11-25 20:50:51,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 28: [2022-11-25 20:50:51,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 50: [2022-11-25 20:50:51,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 50: [2022-11-25 20:50:51,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 50: [2022-11-25 20:50:51,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-25 20:50:51,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-25 20:50:51,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 50: [2022-11-25 20:50:51,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-25 20:50:51,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 37: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 37: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-25 20:50:51,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-25 20:50:51,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-25 20:50:51,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-25 20:50:51,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-25 20:50:51,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 1: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 1: [2022-11-25 20:50:51,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-25 20:50:51,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-25 20:50:51,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 1: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 1: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 1: [2022-11-25 20:50:51,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 10: [2022-11-25 20:50:51,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-25 20:50:51,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-25 20:50:51,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-25 20:50:51,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-25 20:50:51,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-25 20:50:51,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 10: [2022-11-25 20:50:51,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 10: [2022-11-25 20:50:51,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 10: [2022-11-25 20:50:51,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 20:50:51,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-25 20:50:51,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 58: [2022-11-25 20:50:51,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 58: [2022-11-25 20:50:51,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-25 20:50:51,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 59: [2022-11-25 20:50:51,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-25 20:50:51,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-25 20:50:51,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 60: [2022-11-25 20:50:51,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-25 20:50:51,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-25 20:50:51,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 7: [2022-11-25 20:50:51,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 9: [2022-11-25 20:50:51,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 7: [2022-11-25 20:50:51,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-25 20:50:51,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 20:50:51,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-25 20:50:51,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 20:50:51,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 36: [2022-11-25 20:50:51,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-25 20:50:51,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 11: [2022-11-25 20:50:51,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 8: [2022-11-25 20:50:51,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 11: [2022-11-25 20:50:51,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 11: [2022-11-25 20:50:51,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 20:50:51,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 8: [2022-11-25 20:50:51,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-25 20:50:51,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 20:50:51,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-25 20:50:51,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 27: [2022-11-25 20:50:51,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-25 20:50:51,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-25 20:50:51,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-25 20:50:51,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-25 20:50:51,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-25 20:50:51,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-25 20:50:51,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 27: [2022-11-25 20:50:51,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 27: [2022-11-25 20:50:51,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 53: [2022-11-25 20:50:51,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-25 20:50:51,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-25 20:50:51,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 61: [2022-11-25 20:50:51,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 61: [2022-11-25 20:50:51,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-25 20:50:51,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 20:50:51,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 34: [2022-11-25 20:50:51,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-25 20:50:51,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 20:50:51,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-25 20:50:51,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-25 20:50:51,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 20:50:51,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 38: [2022-11-25 20:50:51,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-25 20:50:51,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-25 20:50:51,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-25 20:50:51,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-25 20:50:51,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 59: [2022-11-25 20:50:51,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 25: [2022-11-25 20:50:51,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 38: [2022-11-25 20:50:51,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-25 20:50:51,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 59: [2022-11-25 20:50:51,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 25: [2022-11-25 20:50:51,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 38: [2022-11-25 20:50:51,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 20:50:51,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 20:50:51,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 20:50:51,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 25: [2022-11-25 20:50:51,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 59: [2022-11-25 20:50:51,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 20:50:51,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 26: [2022-11-25 20:50:51,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-25 20:50:51,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 12: [2022-11-25 20:50:51,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-25 20:50:51,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-25 20:50:51,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 20:50:51,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-25 20:50:51,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-25 20:50:51,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 20:50:51,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-25 20:50:51,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-25 20:50:51,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 62: [2022-11-25 20:50:51,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-25 20:50:51,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-25 20:50:51,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 20:50:51,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-25 20:50:51,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-25 20:50:51,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 57: [2022-11-25 20:50:51,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 57: [2022-11-25 20:50:51,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 57: [2022-11-25 20:50:51,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 62: [2022-11-25 20:50:51,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-25 20:50:51,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 59: [2022-11-25 20:50:51,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 62: [2022-11-25 20:50:51,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 59: [2022-11-25 20:50:51,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-25 20:50:51,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 20:50:51,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-25 20:50:51,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-25 20:50:51,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-25 20:50:51,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-25 20:50:51,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-25 20:50:51,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-25 20:50:51,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 43: [2022-11-25 20:50:51,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-25 20:50:51,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-25 20:50:51,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-25 20:50:51,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 20:50:51,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 20:50:51,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 20:50:51,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 20:50:51,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 53: [2022-11-25 20:50:51,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-25 20:50:51,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 45: [2022-11-25 20:50:51,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-25 20:50:51,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 45: [2022-11-25 20:50:51,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 53: [2022-11-25 20:50:51,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 45: [2022-11-25 20:50:51,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-25 20:50:51,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-25 20:50:51,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 32: [2022-11-25 20:50:51,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 45: [2022-11-25 20:50:51,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 45: [2022-11-25 20:50:51,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 20:50:51,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 45: [2022-11-25 20:50:51,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 20:50:51,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 62: [2022-11-25 20:50:51,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-25 20:50:51,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-25 20:50:51,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 20:50:51,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 32: [2022-11-25 20:50:51,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-25 20:50:51,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 27: [2022-11-25 20:50:51,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 27: [2022-11-25 20:50:51,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-25 20:50:51,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 24: [2022-11-25 20:50:51,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 50: [2022-11-25 20:50:51,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 16: [2022-11-25 20:50:51,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 24: [2022-11-25 20:50:51,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 50: [2022-11-25 20:50:51,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-25 20:50:51,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 15: [2022-11-25 20:50:51,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 16: [2022-11-25 20:50:51,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 24: [2022-11-25 20:50:51,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 15: [2022-11-25 20:50:51,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 16: [2022-11-25 20:50:51,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 15: [2022-11-25 20:50:51,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 20:50:51,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-25 20:50:51,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-25 20:50:51,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 20:50:51,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-25 20:50:51,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-25 20:50:51,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 6: [2022-11-25 20:50:51,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-25 20:50:51,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-25 20:50:51,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 6: [2022-11-25 20:50:51,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-25 20:50:51,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 6: [2022-11-25 20:50:51,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-25 20:50:51,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-25 20:50:51,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 6: [2022-11-25 20:50:51,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 6: [2022-11-25 20:50:51,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-25 20:50:51,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-25 20:50:51,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 6: [2022-11-25 20:50:51,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 6: [2022-11-25 20:50:51,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 6: [2022-11-25 20:50:51,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 33: [2022-11-25 20:50:51,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 33: [2022-11-25 20:50:51,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-25 20:50:51,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-25 20:50:51,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-25 20:50:51,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-25 20:50:51,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-25 20:50:51,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 33: [2022-11-25 20:50:51,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 33: [2022-11-25 20:50:51,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 29: [2022-11-25 20:50:51,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-25 20:50:51,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-25 20:50:51,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 41: [2022-11-25 20:50:51,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 41: [2022-11-25 20:50:51,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-25 20:50:51,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 62: [2022-11-25 20:50:51,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-25 20:50:51,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-25 20:50:51,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 50: [2022-11-25 20:50:51,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-25 20:50:51,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-25 20:50:51,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 17: [2022-11-25 20:50:51,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-25 20:50:51,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-25 20:50:51,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 17: [2022-11-25 20:50:51,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-25 20:50:51,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-25 20:50:51,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-25 20:50:51,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 17: [2022-11-25 20:50:51,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 17: [2022-11-25 20:50:51,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 20:50:51,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-25 20:50:51,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-25 20:50:51,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 20:50:51,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-25 20:50:51,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-25 20:50:51,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 12: [2022-11-25 20:50:51,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 12: [2022-11-25 20:50:51,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-25 20:50:51,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 12: [2022-11-25 20:50:51,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-25 20:50:51,793] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-25 20:50:51,793] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 51: [2022-11-25 20:50:51,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-25 20:50:51,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-25 20:50:51,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 49: [2022-11-25 20:50:51,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-25 20:50:51,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-25 20:50:51,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 50: [2022-11-25 20:50:51,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-25 20:50:51,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-25 20:50:51,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 20: [2022-11-25 20:50:51,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-25 20:50:51,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-25 20:50:51,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 50: [2022-11-25 20:50:51,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-25 20:50:51,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-25 20:50:51,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 27: [2022-11-25 20:50:51,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-25 20:50:51,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 10: [2022-11-25 20:50:51,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 27: [2022-11-25 20:50:51,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 10: [2022-11-25 20:50:51,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 10: [2022-11-25 20:50:51,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 20:50:51,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 34: [2022-11-25 20:50:51,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-25 20:50:51,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 62: [2022-11-25 20:50:51,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-25 20:50:51,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-25 20:50:51,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 20:50:51,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-25 20:50:51,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 41: [2022-11-25 20:50:51,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 41: [2022-11-25 20:50:51,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 41: [2022-11-25 20:50:51,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 20:50:51,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-25 20:50:51,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-25 20:50:51,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 5: [2022-11-25 20:50:51,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-25 20:50:51,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-25 20:50:51,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 60: [2022-11-25 20:50:51,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-25 20:50:51,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-25 20:50:51,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 41: [2022-11-25 20:50:51,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-25 20:50:51,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-25 20:50:51,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 20:50:51,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 14: [2022-11-25 20:50:51,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 56: [2022-11-25 20:50:51,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 63: [2022-11-25 20:50:51,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 56: [2022-11-25 20:50:51,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 14: [2022-11-25 20:50:51,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 63: [2022-11-25 20:50:51,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 14: [2022-11-25 20:50:51,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 20:50:51,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 33: [2022-11-25 20:50:51,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-25 20:50:51,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-25 20:50:51,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 20:50:51,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 56: [2022-11-25 20:50:51,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 56: [2022-11-25 20:50:51,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 17: [2022-11-25 20:50:51,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-25 20:50:51,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-25 20:50:51,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 4: [2022-11-25 20:50:51,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 4: [2022-11-25 20:50:51,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-25 20:50:51,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 4: [2022-11-25 20:50:51,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-25 20:50:51,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 4: [2022-11-25 20:50:51,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-25 20:50:51,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-25 20:50:51,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-25 20:50:51,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-25 20:50:51,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-25 20:50:51,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-25 20:50:51,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-25 20:50:51,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 4: [2022-11-25 20:50:51,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 4: [2022-11-25 20:50:51,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 4: [2022-11-25 20:50:51,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 4: [2022-11-25 20:50:51,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 4: [2022-11-25 20:50:51,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 20:50:51,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 25: [2022-11-25 20:50:51,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 52: [2022-11-25 20:50:51,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 25: [2022-11-25 20:50:51,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 52: [2022-11-25 20:50:51,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 25: [2022-11-25 20:50:51,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 6: [2022-11-25 20:50:51,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-25 20:50:51,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-25 20:50:51,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 20:50:51,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-25 20:50:51,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-25 20:50:51,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 20:50:51,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 23: [2022-11-25 20:50:51,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 23: [2022-11-25 20:50:51,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 20:50:51,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 36: [2022-11-25 20:50:51,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-25 20:50:51,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 39: [2022-11-25 20:50:51,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-25 20:50:51,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-25 20:50:51,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 61: [2022-11-25 20:50:51,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 10: [2022-11-25 20:50:51,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 61: [2022-11-25 20:50:51,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-25 20:50:51,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 10: [2022-11-25 20:50:51,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-25 20:50:51,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 41: [2022-11-25 20:50:51,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-25 20:50:51,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-25 20:50:51,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 22: [2022-11-25 20:50:51,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-25 20:50:51,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-25 20:50:51,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 20:50:51,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 44: [2022-11-25 20:50:51,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-25 20:50:51,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-25 20:50:51,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 20:50:51,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-25 20:50:51,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 2: [2022-11-25 20:50:51,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-25 20:50:51,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-25 20:50:51,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 20:50:51,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-25 20:50:51,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-25 20:50:51,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 22: [2022-11-25 20:50:51,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-25 20:50:51,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-25 20:50:51,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 59: [2022-11-25 20:50:51,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-25 20:50:51,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-25 20:50:51,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 47: [2022-11-25 20:50:51,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 47: [2022-11-25 20:50:51,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 47: [2022-11-25 20:50:51,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 17: [2022-11-25 20:50:51,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-25 20:50:51,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-25 20:50:51,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 61: [2022-11-25 20:50:51,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-25 20:50:51,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-25 20:50:51,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 20:50:51,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 35: [2022-11-25 20:50:51,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 42: [2022-11-25 20:50:51,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 35: [2022-11-25 20:50:51,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-25 20:50:51,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 20:50:51,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 20:50:51,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 43: [2022-11-25 20:50:51,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 43: [2022-11-25 20:50:51,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 29: [2022-11-25 20:50:51,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 24: [2022-11-25 20:50:51,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 29: [2022-11-25 20:50:51,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 24: [2022-11-25 20:50:51,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 29: [2022-11-25 20:50:51,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 24: [2022-11-25 20:50:51,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 53: [2022-11-25 20:50:51,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-25 20:50:51,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-25 20:50:51,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 8: [2022-11-25 20:50:51,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-25 20:50:51,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-25 20:50:51,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 51: [2022-11-25 20:50:51,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-25 20:50:51,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-25 20:50:51,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 10: [2022-11-25 20:50:51,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-25 20:50:51,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-25 20:50:51,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 46: [2022-11-25 20:50:51,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-25 20:50:51,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-25 20:50:51,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 20:50:51,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-25 20:50:51,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 3: [2022-11-25 20:50:51,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 54: [2022-11-25 20:50:51,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 54: [2022-11-25 20:50:51,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 54: [2022-11-25 20:50:51,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 54: [2022-11-25 20:50:51,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-25 20:50:51,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-25 20:50:51,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 54: [2022-11-25 20:50:51,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 54: [2022-11-25 20:50:51,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-25 20:50:51,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 20:50:51,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 32: [2022-11-25 20:50:51,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-25 20:50:51,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 28: [2022-11-25 20:50:51,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 19: [2022-11-25 20:50:51,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 28: [2022-11-25 20:50:51,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 19: [2022-11-25 20:50:51,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 28: [2022-11-25 20:50:51,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 19: [2022-11-25 20:50:51,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 49: [2022-11-25 20:50:51,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-25 20:50:51,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-25 20:50:51,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 18: [2022-11-25 20:50:51,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-25 20:50:51,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-25 20:50:51,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 17: [2022-11-25 20:50:51,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 1: [2022-11-25 20:50:51,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 17: [2022-11-25 20:50:51,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-25 20:50:51,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 1: [2022-11-25 20:50:51,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-25 20:50:51,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 20:50:51,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 26: [2022-11-25 20:50:51,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 9: [2022-11-25 20:50:51,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-25 20:50:51,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 20:50:51,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 26: [2022-11-25 20:50:51,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 20:50:51,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 23: [2022-11-25 20:50:51,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-25 20:50:51,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 20:50:51,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-25 20:50:51,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-25 20:50:51,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 54: [2022-11-25 20:50:51,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 54: [2022-11-25 20:50:51,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-25 20:50:51,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 57: [2022-11-25 20:50:51,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-25 20:50:51,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-25 20:50:51,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 20:50:51,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-25 20:50:51,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-25 20:50:51,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 58: [2022-11-25 20:50:51,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 21: [2022-11-25 20:50:51,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 58: [2022-11-25 20:50:51,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 21: [2022-11-25 20:50:51,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 58: [2022-11-25 20:50:51,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 20:50:51,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 20:50:51,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 11: [2022-11-25 20:50:51,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 63: [2022-11-25 20:50:51,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-25 20:50:51,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 11: [2022-11-25 20:50:51,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 11: [2022-11-25 20:50:51,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 45: [2022-11-25 20:50:51,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 45: [2022-11-25 20:50:51,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-25 20:50:51,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 20:50:51,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 31: [2022-11-25 20:50:51,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-25 20:50:51,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 7: [2022-11-25 20:50:51,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-25 20:50:51,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 53: [2022-11-25 20:50:51,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 7: [2022-11-25 20:50:51,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 53: [2022-11-25 20:50:51,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-25 20:50:51,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 27: [2022-11-25 20:50:51,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 27: [2022-11-25 20:50:51,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-25 20:50:51,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 20:50:51,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 30: [2022-11-25 20:50:51,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 30: [2022-11-25 20:50:51,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 12: [2022-11-25 20:50:51,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-25 20:50:51,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 40: [2022-11-25 20:50:51,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-25 20:50:51,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 12: [2022-11-25 20:50:51,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 40: [2022-11-25 20:50:51,884] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-25 20:50:51,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 40: [2022-11-25 20:50:51,884] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-25 20:50:51,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 20:50:51,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-25 20:50:51,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-25 20:50:51,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 20:50:51,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 38: [2022-11-25 20:50:51,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 0: [2022-11-25 20:50:51,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-25 20:50:51,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 20:50:51,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-25 20:50:51,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 5: [2022-11-25 20:50:51,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-25 20:50:51,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-25 20:50:51,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 20: [2022-11-25 20:50:51,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-25 20:50:51,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 50: [2022-11-25 20:50:51,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 20: [2022-11-25 20:50:51,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 50: [2022-11-25 20:50:51,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-25 20:50:51,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 6: [2022-11-25 20:50:51,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 6: [2022-11-25 20:50:51,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-25 20:50:51,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 20:50:51,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-25 20:50:51,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-25 20:50:51,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 20:50:51,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 34: [2022-11-25 20:50:51,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-25 20:50:51,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 16: [2022-11-25 20:50:51,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-25 20:50:51,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-25 20:50:51,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 20:50:51,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-25 20:50:51,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-25 20:50:51,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 41: [2022-11-25 20:50:51,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-25 20:50:51,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-25 20:50:51,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 20:50:51,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-25 20:50:51,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-25 20:50:51,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 4: [2022-11-25 20:50:51,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 4: [2022-11-25 20:50:51,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-25 20:50:51,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 20:50:51,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-25 20:50:51,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-25 20:50:51,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 47: [2022-11-25 20:50:51,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-25 20:50:51,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-25 20:50:51,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 59: [2022-11-25 20:50:51,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-25 20:50:51,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-25 20:50:51,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 46: [2022-11-25 20:50:51,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-25 20:50:51,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 46: [2022-11-25 20:50:51,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 44: [2022-11-25 20:50:51,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-25 20:50:51,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-25 20:50:51,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 29: [2022-11-25 20:50:51,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-25 20:50:51,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-25 20:50:51,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 14: [2022-11-25 20:50:51,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-25 20:50:51,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-25 20:50:51,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 24: [2022-11-25 20:50:51,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-25 20:50:51,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-25 20:50:51,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 22: [2022-11-25 20:50:51,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-25 20:50:51,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-25 20:50:51,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 2: [2022-11-25 20:50:51,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-25 20:50:51,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-25 20:50:51,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 20:50:51,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-25 20:50:51,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-25 20:50:51,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 33: [2022-11-25 20:50:51,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-25 20:50:51,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-25 20:50:51,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 25: [2022-11-25 20:50:51,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-25 20:50:51,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-25 20:50:51,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 27: [2022-11-25 20:50:51,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 39: [2022-11-25 20:50:51,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 27: [2022-11-25 20:50:51,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-25 20:50:51,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 39: [2022-11-25 20:50:51,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-25 20:50:51,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 60: [2022-11-25 20:50:51,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-25 20:50:51,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-25 20:50:51,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 17: [2022-11-25 20:50:51,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-25 20:50:51,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-25 20:50:51,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 57: [2022-11-25 20:50:51,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 57: [2022-11-25 20:50:51,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-25 20:50:51,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 20:50:51,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-25 20:50:51,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-25 20:50:51,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 20:50:51,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 42: [2022-11-25 20:50:51,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-25 20:50:51,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 58: [2022-11-25 20:50:51,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-25 20:50:51,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-25 20:50:51,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 62: [2022-11-25 20:50:51,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-25 20:50:51,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-25 20:50:51,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 10: [2022-11-25 20:50:51,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 10: [2022-11-25 20:50:51,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-25 20:50:51,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 20:50:51,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-25 20:50:51,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-25 20:50:51,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 20:50:51,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-25 20:50:51,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-25 20:50:51,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 8: [2022-11-25 20:50:51,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-25 20:50:51,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 35: [2022-11-25 20:50:51,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 8: [2022-11-25 20:50:51,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 35: [2022-11-25 20:50:51,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-25 20:50:51,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 20:50:51,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-25 20:50:51,925] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-25 20:50:51,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 20:50:51,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-25 20:50:51,927] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 23: [2022-11-25 20:50:51,927] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 20:50:51,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 53: [2022-11-25 20:50:51,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-25 20:50:51,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 63: [2022-11-25 20:50:51,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 53: [2022-11-25 20:50:51,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 20:50:51,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 20:50:51,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-25 20:50:51,926] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-25 20:50:51,926] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 12: [2022-11-25 20:50:51,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-25 20:50:51,929] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-25 20:50:51,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 20:50:51,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-25 20:50:51,929] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-25 20:50:51,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 62: [2022-11-25 20:50:51,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-25 20:50:51,930] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 15: [2022-11-25 20:50:51,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 62: [2022-11-25 20:50:51,930] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 15: [2022-11-25 20:50:51,930] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-25 20:50:51,930] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 28: [2022-11-25 20:50:51,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-25 20:50:51,931] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-25 20:50:51,931] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 20:50:51,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 9: [2022-11-25 20:50:51,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-25 20:50:51,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 15: [2022-11-25 20:50:51,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-25 20:50:51,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-25 20:50:51,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 5: [2022-11-25 20:50:51,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-25 20:50:51,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 11: [2022-11-25 20:50:51,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 5: [2022-11-25 20:50:51,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 11: [2022-11-25 20:50:51,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-25 20:50:51,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 20:50:51,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 34: [2022-11-25 20:50:51,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-25 20:50:51,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 48: [2022-11-25 20:50:51,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-25 20:50:51,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-25 20:50:51,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 45: [2022-11-25 20:50:51,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-25 20:50:51,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-25 20:50:51,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 54: [2022-11-25 20:50:51,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 54: [2022-11-25 20:50:51,935] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-25 20:50:51,935] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 61: [2022-11-25 20:50:51,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 6: [2022-11-25 20:50:51,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 61: [2022-11-25 20:50:51,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 20: [2022-11-25 20:50:51,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 6: [2022-11-25 20:50:51,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 61: [2022-11-25 20:50:51,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 20: [2022-11-25 20:50:51,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 6: [2022-11-25 20:50:51,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 20: [2022-11-25 20:50:51,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 20:50:51,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-25 20:50:51,937] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 0: [2022-11-25 20:50:51,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 18: [2022-11-25 20:50:51,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-25 20:50:51,937] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-25 20:50:51,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 40: [2022-11-25 20:50:51,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-25 20:50:51,938] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-25 20:50:51,938] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 4: [2022-11-25 20:50:51,939] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 4: [2022-11-25 20:50:51,939] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-25 20:50:51,939] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 1: [2022-11-25 20:50:51,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-25 20:50:51,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-25 20:50:51,941] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 17: [2022-11-25 20:50:51,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 17: [2022-11-25 20:50:51,941] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 17: [2022-11-25 20:50:51,941] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 20:50:51,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 49: [2022-11-25 20:50:51,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-25 20:50:51,941] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 52: [2022-11-25 20:50:51,941] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 49: [2022-11-25 20:50:51,941] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 20:50:51,941] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 20:50:51,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-25 20:50:51,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 41: [2022-11-25 20:50:51,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 41: [2022-11-25 20:50:51,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 56: [2022-11-25 20:50:51,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 41: [2022-11-25 20:50:51,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 2: [2022-11-25 20:50:51,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-25 20:50:51,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-25 20:50:51,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 22: [2022-11-25 20:50:51,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-25 20:50:51,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-25 20:50:51,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 7: [2022-11-25 20:50:51,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-25 20:50:51,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-25 20:50:51,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 46: [2022-11-25 20:50:51,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-25 20:50:51,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-25 20:50:51,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 44: [2022-11-25 20:50:51,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-25 20:50:51,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-25 20:50:51,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 27: [2022-11-25 20:50:51,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-25 20:50:51,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-25 20:50:51,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 20:50:51,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-25 20:50:51,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-25 20:50:51,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 57: [2022-11-25 20:50:51,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 57: [2022-11-25 20:50:51,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-25 20:50:51,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 25: [2022-11-25 20:50:51,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-25 20:50:51,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-25 20:50:51,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 10: [2022-11-25 20:50:51,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-25 20:50:51,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-25 20:50:51,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 58: [2022-11-25 20:50:51,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-25 20:50:51,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-25 20:50:51,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 33: [2022-11-25 20:50:51,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-25 20:50:51,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-25 20:50:51,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 29: [2022-11-25 20:50:51,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-25 20:50:51,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 39: [2022-11-25 20:50:51,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 29: [2022-11-25 20:50:51,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 39: [2022-11-25 20:50:51,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-25 20:50:51,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 16: [2022-11-25 20:50:51,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-25 20:50:51,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-25 20:50:51,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 50: [2022-11-25 20:50:51,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 42: [2022-11-25 20:50:51,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 50: [2022-11-25 20:50:51,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 59: [2022-11-25 20:50:51,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 50: [2022-11-25 20:50:51,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 20:50:51,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 59: [2022-11-25 20:50:51,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-25 20:50:51,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 20:50:51,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 51: [2022-11-25 20:50:51,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-25 20:50:51,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 51: [2022-11-25 20:50:51,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-25 20:50:51,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-25 20:50:51,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 51: [2022-11-25 20:50:51,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 20:50:51,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 3: [2022-11-25 20:50:51,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-25 20:50:51,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 20:50:51,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-25 20:50:51,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-25 20:50:51,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 47: [2022-11-25 20:50:51,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-25 20:50:51,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-25 20:50:51,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 20:50:51,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 21: [2022-11-25 20:50:51,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-25 20:50:51,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 20:50:51,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 37: [2022-11-25 20:50:51,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-25 20:50:51,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 20:50:51,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-25 20:50:51,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 21: [2022-11-25 20:50:51,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 8: [2022-11-25 20:50:51,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 8: [2022-11-25 20:50:51,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-25 20:50:51,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 20:50:51,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 13: [2022-11-25 20:50:51,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-25 20:50:51,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 14: [2022-11-25 20:50:51,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 14: [2022-11-25 20:50:51,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-25 20:50:51,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 20:50:51,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-25 20:50:51,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-25 20:50:51,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 1: [2022-11-25 20:50:51,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-25 20:50:51,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-25 20:50:51,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 54: [2022-11-25 20:50:51,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 54: [2022-11-25 20:50:51,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-25 20:50:51,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 12: [2022-11-25 20:50:51,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-25 20:50:51,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-25 20:50:51,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 16: [2022-11-25 20:50:51,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 16: [2022-11-25 20:50:51,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-25 20:50:51,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 49: [2022-11-25 20:50:51,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-25 20:50:51,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-25 20:50:51,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 20:50:51,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-25 20:50:51,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-25 20:50:51,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 20:50:51,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-25 20:50:51,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 30: [2022-11-25 20:50:51,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 11: [2022-11-25 20:50:51,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-25 20:50:51,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-25 20:50:51,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 20:50:51,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-25 20:50:51,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-25 20:50:51,970] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 19: [2022-11-25 20:50:51,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 19: [2022-11-25 20:50:51,971] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 26: [2022-11-25 20:50:51,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 19: [2022-11-25 20:50:51,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 20:50:51,971] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-25 20:50:51,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 7: [2022-11-25 20:50:51,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 24: [2022-11-25 20:50:51,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 61: [2022-11-25 20:50:51,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 7: [2022-11-25 20:50:51,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 24: [2022-11-25 20:50:51,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 61: [2022-11-25 20:50:51,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 7: [2022-11-25 20:50:51,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 24: [2022-11-25 20:50:51,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 61: [2022-11-25 20:50:51,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 20:50:51,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-25 20:50:51,973] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 38: [2022-11-25 20:50:51,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 31: [2022-11-25 20:50:51,973] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 20:50:51,973] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-25 20:50:51,973] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 53: [2022-11-25 20:50:51,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-25 20:50:51,973] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-25 20:50:51,973] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 20:50:51,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 9: [2022-11-25 20:50:51,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-25 20:50:51,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 14: [2022-11-25 20:50:51,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-25 20:50:51,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-25 20:50:51,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 40: [2022-11-25 20:50:51,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 40: [2022-11-25 20:50:51,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 40: [2022-11-25 20:50:51,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 45: [2022-11-25 20:50:51,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-25 20:50:51,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-25 20:50:51,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 60: [2022-11-25 20:50:51,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 60: [2022-11-25 20:50:51,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-25 20:50:51,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 20:50:51,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-25 20:50:51,979] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-25 20:50:51,979] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 28: [2022-11-25 20:50:51,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-25 20:50:51,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-25 20:50:51,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 20: [2022-11-25 20:50:51,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-25 20:50:51,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-25 20:50:51,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 20:50:51,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-25 20:50:51,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-25 20:50:51,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 33: [2022-11-25 20:50:51,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 48: [2022-11-25 20:50:51,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-25 20:50:51,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 33: [2022-11-25 20:50:51,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 48: [2022-11-25 20:50:51,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 33: [2022-11-25 20:50:51,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 19: [2022-11-25 20:50:51,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 19: [2022-11-25 20:50:51,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-25 20:50:51,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 20:50:51,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 36: [2022-11-25 20:50:51,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-25 20:50:51,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 39: [2022-11-25 20:50:51,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-25 20:50:51,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-25 20:50:51,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 61: [2022-11-25 20:50:51,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-25 20:50:51,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-25 20:50:51,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 15: [2022-11-25 20:50:51,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-25 20:50:51,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-25 20:50:51,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 20:50:51,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-25 20:50:51,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-25 20:50:51,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 2: [2022-11-25 20:50:51,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 2: [2022-11-25 20:50:51,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-25 20:50:51,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 20:50:51,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-25 20:50:51,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-25 20:50:51,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 20:50:51,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 47: [2022-11-25 20:50:51,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-25 20:50:51,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-25 20:50:51,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 20:50:51,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-25 20:50:51,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 18: [2022-11-25 20:50:51,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-25 20:50:51,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-25 20:50:51,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 59: [2022-11-25 20:50:52,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-25 20:50:52,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-25 20:50:52,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 35: [2022-11-25 20:50:52,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-25 20:50:52,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-25 20:50:52,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 44: [2022-11-25 20:50:52,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-25 20:50:52,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-25 20:50:52,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 28: [2022-11-25 20:50:52,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 28: [2022-11-25 20:50:52,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-25 20:50:52,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 20:50:52,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-25 20:50:52,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-25 20:50:52,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 35: [2022-11-25 20:50:52,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-25 20:50:52,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-25 20:50:52,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 58: [2022-11-25 20:50:52,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-25 20:50:52,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-25 20:50:52,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 20:50:52,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-25 20:50:52,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-25 20:50:52,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 20:50:52,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 56: [2022-11-25 20:50:52,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 56: [2022-11-25 20:50:52,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 20:50:52,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 1: [2022-11-25 20:50:52,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-25 20:50:52,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 1: [2022-11-25 20:50:52,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 8: [2022-11-25 20:50:52,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-25 20:50:52,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-25 20:50:52,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 54: [2022-11-25 20:50:52,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 54: [2022-11-25 20:50:52,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-25 20:50:52,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 20:50:52,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-25 20:50:52,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 20:50:52,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-25 20:50:52,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-25 20:50:52,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 51: [2022-11-25 20:50:52,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-25 20:50:52,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-25 20:50:52,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 46: [2022-11-25 20:50:52,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-25 20:50:52,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-25 20:50:52,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 7: [2022-11-25 20:50:52,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-25 20:50:52,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-25 20:50:52,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 11: [2022-11-25 20:50:52,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-25 20:50:52,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-25 20:50:52,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 16: [2022-11-25 20:50:52,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-25 20:50:52,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-25 20:50:52,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 24: [2022-11-25 20:50:52,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-25 20:50:52,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-25 20:50:52,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 29: [2022-11-25 20:50:52,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-25 20:50:52,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-25 20:50:52,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 15: [2022-11-25 20:50:52,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-25 20:50:52,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 15: [2022-11-25 20:50:52,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 33: [2022-11-25 20:50:52,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 57: [2022-11-25 20:50:52,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 33: [2022-11-25 20:50:52,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-25 20:50:52,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 57: [2022-11-25 20:50:52,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-25 20:50:52,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 20:50:52,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-25 20:50:52,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-25 20:50:52,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 20:50:52,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-25 20:50:52,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-25 20:50:52,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 20:50:52,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 55: [2022-11-25 20:50:52,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-25 20:50:52,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 8: [2022-11-25 20:50:52,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-25 20:50:52,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 8: [2022-11-25 20:50:52,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 1: [2022-11-25 20:50:52,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 58: [2022-11-25 20:50:52,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 2: [2022-11-25 20:50:52,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 1: [2022-11-25 20:50:52,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 2: [2022-11-25 20:50:52,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 1: [2022-11-25 20:50:52,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 58: [2022-11-25 20:50:52,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 2: [2022-11-25 20:50:52,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 58: [2022-11-25 20:50:52,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 20:50:52,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-25 20:50:52,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-25 20:50:52,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 11: [2022-11-25 20:50:52,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-25 20:50:52,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-25 20:50:52,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 40: [2022-11-25 20:50:52,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-25 20:50:52,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-25 20:50:52,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 47: [2022-11-25 20:50:52,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-25 20:50:52,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 9: [2022-11-25 20:50:52,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 47: [2022-11-25 20:50:52,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 20:50:52,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 15: [2022-11-25 20:50:52,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 9: [2022-11-25 20:50:52,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 15: [2022-11-25 20:50:52,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-25 20:50:52,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 20:50:52,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-25 20:50:52,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-25 20:50:52,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 12: [2022-11-25 20:50:52,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 12: [2022-11-25 20:50:52,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-25 20:50:52,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-25 20:50:52,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-25 20:50:52,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 20:50:52,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 12: [2022-11-25 20:50:52,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 61: [2022-11-25 20:50:52,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 31: [2022-11-25 20:50:52,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 61: [2022-11-25 20:50:52,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 61: [2022-11-25 20:50:52,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 20:50:52,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 60: [2022-11-25 20:50:52,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-25 20:50:52,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-25 20:50:52,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 45: [2022-11-25 20:50:52,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 60: [2022-11-25 20:50:52,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 45: [2022-11-25 20:50:52,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 60: [2022-11-25 20:50:52,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 45: [2022-11-25 20:50:52,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 60: [2022-11-25 20:50:52,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 46: [2022-11-25 20:50:52,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-25 20:50:52,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-25 20:50:52,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 59: [2022-11-25 20:50:52,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-25 20:50:52,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-25 20:50:52,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 20:50:52,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-25 20:50:52,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 36: [2022-11-25 20:50:52,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 29: [2022-11-25 20:50:52,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-25 20:50:52,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-25 20:50:52,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 20:50:52,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 24: [2022-11-25 20:50:52,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 24: [2022-11-25 20:50:52,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-25 20:50:52,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 20:50:52,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-25 20:50:52,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 20: [2022-11-25 20:50:52,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 20: [2022-11-25 20:50:52,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-25 20:50:52,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 39: [2022-11-25 20:50:52,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 20: [2022-11-25 20:50:52,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-25 20:50:52,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 39: [2022-11-25 20:50:52,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 20: [2022-11-25 20:50:52,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 39: [2022-11-25 20:50:52,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 20:50:52,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-25 20:50:52,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-25 20:50:52,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 20:50:52,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-25 20:50:52,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-25 20:50:52,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 57: [2022-11-25 20:50:52,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 57: [2022-11-25 20:50:52,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 26: [2022-11-25 20:50:52,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 57: [2022-11-25 20:50:52,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 20:50:52,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-25 20:50:52,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 7: [2022-11-25 20:50:52,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 7: [2022-11-25 20:50:52,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-25 20:50:52,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 28: [2022-11-25 20:50:52,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-25 20:50:52,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-25 20:50:52,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 19: [2022-11-25 20:50:52,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 26: [2022-11-25 20:50:52,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 19: [2022-11-25 20:50:52,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 26: [2022-11-25 20:50:52,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 19: [2022-11-25 20:50:52,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 20:50:52,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 16: [2022-11-25 20:50:52,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-25 20:50:52,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-25 20:50:52,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 19: [2022-11-25 20:50:52,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-25 20:50:52,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-25 20:50:52,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 51: [2022-11-25 20:50:52,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-25 20:50:52,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-25 20:50:52,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 48: [2022-11-25 20:50:52,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-25 20:50:52,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-25 20:50:52,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 35: [2022-11-25 20:50:52,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-25 20:50:52,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-25 20:50:52,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 20:50:52,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 42: [2022-11-25 20:50:52,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-25 20:50:52,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 54: [2022-11-25 20:50:52,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 54: [2022-11-25 20:50:52,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-25 20:50:52,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 20:50:52,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-25 20:50:52,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 56: [2022-11-25 20:50:52,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 20:50:52,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-25 20:50:52,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-25 20:50:52,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 18: [2022-11-25 20:50:52,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-25 20:50:52,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-25 20:50:52,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 48: [2022-11-25 20:50:52,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-25 20:50:52,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-25 20:50:52,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 48: [2022-11-25 20:50:52,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-25 20:50:52,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-25 20:50:52,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 20:50:52,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 31: [2022-11-25 20:50:52,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-25 20:50:52,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 44: [2022-11-25 20:50:52,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-25 20:50:52,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-25 20:50:52,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 48: [2022-11-25 20:50:52,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 48: [2022-11-25 20:50:52,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-25 20:50:52,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 18: [2022-11-25 20:50:52,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 18: [2022-11-25 20:50:52,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-25 20:50:52,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 20:50:52,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-25 20:50:52,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-25 20:50:52,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 45: [2022-11-25 20:50:52,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-25 20:50:52,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step5000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-25 20:50:52,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: successfully saved checkpoint at iteration 5000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5156.93 63: iteration 5010/ 24424 | consumed samples: 2565120 | consumed tokens: 5253365760 | elapsed time per iteration (s): 2.81 | learning rate: 1.833E-04 | global batch size: 512 | lm loss: 2.311485E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.952 | TFLOPs: 18.73 | 63: iteration 5020/ 24424 | consumed samples: 2570240 | consumed tokens: 5263851520 | elapsed time per iteration (s): 2.23 | learning rate: 1.832E-04 | global batch size: 512 | lm loss: 2.324175E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.234 | TFLOPs: 23.60 | 63: iteration 5030/ 24424 | consumed samples: 2575360 | consumed tokens: 5274337280 | elapsed time per iteration (s): 2.23 | learning rate: 1.832E-04 | global batch size: 512 | lm loss: 2.305317E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.717 | TFLOPs: 23.65 | 63: iteration 5040/ 24424 | consumed samples: 2580480 | consumed tokens: 5284823040 | elapsed time per iteration (s): 2.27 | learning rate: 1.831E-04 | global batch size: 512 | lm loss: 2.289958E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.981 | TFLOPs: 23.26 | 63: iteration 5050/ 24424 | consumed samples: 2585600 | consumed tokens: 5295308800 | elapsed time per iteration (s): 2.24 | learning rate: 1.830E-04 | global batch size: 512 | lm loss: 2.298910E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.345 | TFLOPs: 23.51 | 63: iteration 5060/ 24424 | consumed samples: 2590720 | consumed tokens: 5305794560 | elapsed time per iteration (s): 2.24 | learning rate: 1.830E-04 | global batch size: 512 | lm loss: 2.281817E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.168 | TFLOPs: 23.49 | 63: iteration 5070/ 24424 | consumed samples: 2595840 | consumed tokens: 5316280320 | elapsed time per iteration (s): 2.24 | learning rate: 1.829E-04 | global batch size: 512 | lm loss: 2.306354E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.179 | TFLOPs: 23.49 | 63: iteration 5080/ 24424 | consumed samples: 2600960 | consumed tokens: 5326766080 | elapsed time per iteration (s): 2.23 | learning rate: 1.828E-04 | global batch size: 512 | lm loss: 2.289299E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.128 | TFLOPs: 23.59 | 63: iteration 5090/ 24424 | consumed samples: 2606080 | consumed tokens: 5337251840 | elapsed time per iteration (s): 2.24 | learning rate: 1.827E-04 | global batch size: 512 | lm loss: 2.307593E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.917 | TFLOPs: 23.57 | 63: iteration 5100/ 24424 | consumed samples: 2611200 | consumed tokens: 5347737600 | elapsed time per iteration (s): 2.26 | learning rate: 1.827E-04 | global batch size: 512 | lm loss: 2.299571E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.966 | TFLOPs: 23.37 | 63: iteration 5110/ 24424 | consumed samples: 2616320 | consumed tokens: 5358223360 | elapsed time per iteration (s): 2.46 | learning rate: 1.826E-04 | global batch size: 512 | lm loss: 2.286797E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 208.170 | TFLOPs: 21.43 | 63: iteration 5120/ 24424 | consumed samples: 2621440 | consumed tokens: 5368709120 | elapsed time per iteration (s): 2.26 | learning rate: 1.825E-04 | global batch size: 512 | lm loss: 2.300719E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.722 | TFLOPs: 23.34 | 63: iteration 5130/ 24424 | consumed samples: 2626560 | consumed tokens: 5379194880 | elapsed time per iteration (s): 3.39 | learning rate: 1.825E-04 | global batch size: 512 | lm loss: 2.287389E+00 | grad norm: 0.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 150.813 | TFLOPs: 15.53 | 63: iteration 5140/ 24424 | consumed samples: 2631680 | consumed tokens: 5389680640 | elapsed time per iteration (s): 2.24 | learning rate: 1.824E-04 | global batch size: 512 | lm loss: 2.300272E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.117 | TFLOPs: 23.48 | 63: iteration 5150/ 24424 | consumed samples: 2636800 | consumed tokens: 5400166400 | elapsed time per iteration (s): 2.24 | learning rate: 1.823E-04 | global batch size: 512 | lm loss: 2.295477E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.238 | TFLOPs: 23.50 | 63: iteration 5160/ 24424 | consumed samples: 2641920 | consumed tokens: 5410652160 | elapsed time per iteration (s): 2.24 | learning rate: 1.823E-04 | global batch size: 512 | lm loss: 2.311762E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.200 | TFLOPs: 23.49 | 63: iteration 5170/ 24424 | consumed samples: 2647040 | consumed tokens: 5421137920 | elapsed time per iteration (s): 2.25 | learning rate: 1.822E-04 | global batch size: 512 | lm loss: 2.300920E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.473 | TFLOPs: 23.42 | 63: iteration 5180/ 24424 | consumed samples: 2652160 | consumed tokens: 5431623680 | elapsed time per iteration (s): 2.27 | learning rate: 1.821E-04 | global batch size: 512 | lm loss: 2.296994E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.716 | TFLOPs: 23.24 | 63: iteration 5190/ 24424 | consumed samples: 2657280 | consumed tokens: 5442109440 | elapsed time per iteration (s): 2.23 | learning rate: 1.821E-04 | global batch size: 512 | lm loss: 2.291941E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.307 | TFLOPs: 23.61 | 63: iteration 5200/ 24424 | consumed samples: 2662400 | consumed tokens: 5452595200 | elapsed time per iteration (s): 2.24 | learning rate: 1.820E-04 | global batch size: 512 | lm loss: 2.285115E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.918 | TFLOPs: 23.57 | 63: iteration 5210/ 24424 | consumed samples: 2667520 | consumed tokens: 5463080960 | elapsed time per iteration (s): 2.26 | learning rate: 1.819E-04 | global batch size: 512 | lm loss: 2.295046E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.889 | TFLOPs: 23.36 | 63: iteration 5220/ 24424 | consumed samples: 2672640 | consumed tokens: 5473566720 | elapsed time per iteration (s): 2.24 | learning rate: 1.818E-04 | global batch size: 512 | lm loss: 2.296385E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.945 | TFLOPs: 23.57 | 63: iteration 5230/ 24424 | consumed samples: 2677760 | consumed tokens: 5484052480 | elapsed time per iteration (s): 2.24 | learning rate: 1.818E-04 | global batch size: 512 | lm loss: 2.282247E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.907 | TFLOPs: 23.56 | 63: iteration 5240/ 24424 | consumed samples: 2682880 | consumed tokens: 5494538240 | elapsed time per iteration (s): 2.24 | learning rate: 1.817E-04 | global batch size: 512 | lm loss: 2.287282E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.077 | TFLOPs: 23.48 | 63: iteration 5250/ 24424 | consumed samples: 2688000 | consumed tokens: 5505024000 | elapsed time per iteration (s): 2.24 | learning rate: 1.816E-04 | global batch size: 512 | lm loss: 2.282344E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.896 | TFLOPs: 23.56 | 63: iteration 5260/ 24424 | consumed samples: 2693120 | consumed tokens: 5515509760 | elapsed time per iteration (s): 2.26 | learning rate: 1.816E-04 | global batch size: 512 | lm loss: 2.314338E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.876 | TFLOPs: 23.36 | 63: iteration 5270/ 24424 | consumed samples: 2698240 | consumed tokens: 5525995520 | elapsed time per iteration (s): 2.23 | learning rate: 1.815E-04 | global batch size: 512 | lm loss: 2.283337E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.355 | TFLOPs: 23.61 | 63: iteration 5280/ 24424 | consumed samples: 2703360 | consumed tokens: 5536481280 | elapsed time per iteration (s): 2.23 | learning rate: 1.814E-04 | global batch size: 512 | lm loss: 2.273251E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.521 | TFLOPs: 23.63 | 63: iteration 5290/ 24424 | consumed samples: 2708480 | consumed tokens: 5546967040 | elapsed time per iteration (s): 3.59 | learning rate: 1.813E-04 | global batch size: 512 | lm loss: 2.276863E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 142.489 | TFLOPs: 14.67 | 63: iteration 5300/ 24424 | consumed samples: 2713600 | consumed tokens: 5557452800 | elapsed time per iteration (s): 2.23 | learning rate: 1.813E-04 | global batch size: 512 | lm loss: 2.294183E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.515 | TFLOPs: 23.63 | 63: iteration 5310/ 24424 | consumed samples: 2718720 | consumed tokens: 5567938560 | elapsed time per iteration (s): 2.24 | learning rate: 1.812E-04 | global batch size: 512 | lm loss: 2.265916E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.410 | TFLOPs: 23.51 | 63: iteration 5320/ 24424 | consumed samples: 2723840 | consumed tokens: 5578424320 | elapsed time per iteration (s): 2.26 | learning rate: 1.811E-04 | global batch size: 512 | lm loss: 2.272833E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.004 | TFLOPs: 23.37 | 63: iteration 5330/ 24424 | consumed samples: 2728960 | consumed tokens: 5588910080 | elapsed time per iteration (s): 2.26 | learning rate: 1.811E-04 | global batch size: 512 | lm loss: 2.288795E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.909 | TFLOPs: 23.36 | 63: iteration 5340/ 24424 | consumed samples: 2734080 | consumed tokens: 5599395840 | elapsed time per iteration (s): 2.27 | learning rate: 1.810E-04 | global batch size: 512 | lm loss: 2.291161E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.229 | TFLOPs: 23.19 | 63: iteration 5350/ 24424 | consumed samples: 2739200 | consumed tokens: 5609881600 | elapsed time per iteration (s): 2.23 | learning rate: 1.809E-04 | global batch size: 512 | lm loss: 2.288474E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.560 | TFLOPs: 23.63 | 63: iteration 5360/ 24424 | consumed samples: 2744320 | consumed tokens: 5620367360 | elapsed time per iteration (s): 2.23 | learning rate: 1.808E-04 | global batch size: 512 | lm loss: 2.305259E+00 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.519 | TFLOPs: 23.63 | 63: iteration 5370/ 24424 | consumed samples: 2749440 | consumed tokens: 5630853120 | elapsed time per iteration (s): 2.27 | learning rate: 1.808E-04 | global batch size: 512 | lm loss: 2.268422E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.974 | TFLOPs: 23.26 | 63: iteration 5380/ 24424 | consumed samples: 2754560 | consumed tokens: 5641338880 | elapsed time per iteration (s): 2.27 | learning rate: 1.807E-04 | global batch size: 512 | lm loss: 2.285222E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.867 | TFLOPs: 23.25 | 63: iteration 5390/ 24424 | consumed samples: 2759680 | consumed tokens: 5651824640 | elapsed time per iteration (s): 2.27 | learning rate: 1.806E-04 | global batch size: 512 | lm loss: 2.248138E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.064 | TFLOPs: 23.17 | 63: iteration 5400/ 24424 | consumed samples: 2764800 | consumed tokens: 5662310400 | elapsed time per iteration (s): 2.26 | learning rate: 1.806E-04 | global batch size: 512 | lm loss: 2.272227E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.994 | TFLOPs: 23.37 | 63: iteration 5410/ 24424 | consumed samples: 2769920 | consumed tokens: 5672796160 | elapsed time per iteration (s): 2.23 | learning rate: 1.805E-04 | global batch size: 512 | lm loss: 2.268186E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.349 | TFLOPs: 23.61 | 63: iteration 5420/ 24424 | consumed samples: 2775040 | consumed tokens: 5683281920 | elapsed time per iteration (s): 2.34 | learning rate: 1.804E-04 | global batch size: 512 | lm loss: 2.274549E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 218.489 | TFLOPs: 22.49 | 63: iteration 5430/ 24424 | consumed samples: 2780160 | consumed tokens: 5693767680 | elapsed time per iteration (s): 2.23 | learning rate: 1.803E-04 | global batch size: 512 | lm loss: 2.271010E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.141 | TFLOPs: 23.59 | 63: iteration 5440/ 24424 | consumed samples: 2785280 | consumed tokens: 5704253440 | elapsed time per iteration (s): 3.75 | learning rate: 1.803E-04 | global batch size: 512 | lm loss: 2.287144E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.498 | TFLOPs: 14.05 | 63: iteration 5450/ 24424 | consumed samples: 2790400 | consumed tokens: 5714739200 | elapsed time per iteration (s): 2.25 | learning rate: 1.802E-04 | global batch size: 512 | lm loss: 2.271383E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.874 | TFLOPs: 23.46 | 63: iteration 5460/ 24424 | consumed samples: 2795520 | consumed tokens: 5725224960 | elapsed time per iteration (s): 2.27 | learning rate: 1.801E-04 | global batch size: 512 | lm loss: 3.921776E+00 | grad norm: 38.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.080 | TFLOPs: 23.17 | 63: iteration 5470/ 24424 | consumed samples: 2800640 | consumed tokens: 5735710720 | elapsed time per iteration (s): 2.27 | learning rate: 1.800E-04 | global batch size: 512 | lm loss: 9.480075E+00 | grad norm: 6.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.391 | TFLOPs: 23.20 | 63: iteration 5480/ 24424 | consumed samples: 2805760 | consumed tokens: 5746196480 | elapsed time per iteration (s): 2.43 | learning rate: 1.800E-04 | global batch size: 512 | lm loss: 8.203741E+00 | grad norm: 1.884 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 210.348 | TFLOPs: 21.65 | 63: iteration 5490/ 24424 | consumed samples: 2810880 | consumed tokens: 5756682240 | elapsed time per iteration (s): 2.31 | learning rate: 1.799E-04 | global batch size: 512 | lm loss: 6.982838E+00 | grad norm: 1.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.013 | TFLOPs: 22.86 | 63: iteration 5500/ 24424 | consumed samples: 2816000 | consumed tokens: 5767168000 | elapsed time per iteration (s): 2.31 | learning rate: 1.798E-04 | global batch size: 512 | lm loss: 6.468295E+00 | grad norm: 1.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.731 | TFLOPs: 22.83 | 63: iteration 5510/ 24424 | consumed samples: 2821120 | consumed tokens: 5777653760 | elapsed time per iteration (s): 2.29 | learning rate: 1.797E-04 | global batch size: 512 | lm loss: 6.184678E+00 | grad norm: 3.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.804 | TFLOPs: 23.04 | 63: iteration 5520/ 24424 | consumed samples: 2826240 | consumed tokens: 5788139520 | elapsed time per iteration (s): 2.27 | learning rate: 1.797E-04 | global batch size: 512 | lm loss: 5.872264E+00 | grad norm: 1.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.923 | TFLOPs: 23.26 | 63: iteration 5530/ 24424 | consumed samples: 2831360 | consumed tokens: 5798625280 | elapsed time per iteration (s): 2.28 | learning rate: 1.796E-04 | global batch size: 512 | lm loss: 5.528287E+00 | grad norm: 1.063 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.907 | TFLOPs: 23.15 | 63: iteration 5540/ 24424 | consumed samples: 2836480 | consumed tokens: 5809111040 | elapsed time per iteration (s): 2.27 | learning rate: 1.795E-04 | global batch size: 512 | lm loss: 5.139630E+00 | grad norm: 2.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.219 | TFLOPs: 23.19 | 63: iteration 5550/ 24424 | consumed samples: 2841600 | consumed tokens: 5819596800 | elapsed time per iteration (s): 2.25 | learning rate: 1.795E-04 | global batch size: 512 | lm loss: 4.221673E+00 | grad norm: 2.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.496 | TFLOPs: 23.42 | 63: iteration 5560/ 24424 | consumed samples: 2846720 | consumed tokens: 5830082560 | elapsed time per iteration (s): 2.30 | learning rate: 1.794E-04 | global batch size: 512 | lm loss: 3.650353E+00 | grad norm: 1.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.749 | TFLOPs: 22.93 | 63: iteration 5570/ 24424 | consumed samples: 2851840 | consumed tokens: 5840568320 | elapsed time per iteration (s): 2.26 | learning rate: 1.793E-04 | global batch size: 512 | lm loss: 3.135582E+00 | grad norm: 0.759 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.296 | TFLOPs: 23.30 | 63: iteration 5580/ 24424 | consumed samples: 2856960 | consumed tokens: 5851054080 | elapsed time per iteration (s): 2.32 | learning rate: 1.792E-04 | global batch size: 512 | lm loss: 2.895886E+00 | grad norm: 0.862 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.674 | TFLOPs: 22.72 | 63: iteration 5590/ 24424 | consumed samples: 2862080 | consumed tokens: 5861539840 | elapsed time per iteration (s): 2.24 | learning rate: 1.792E-04 | global batch size: 512 | lm loss: 2.705232E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.418 | TFLOPs: 23.51 | 63: iteration 5600/ 24424 | consumed samples: 2867200 | consumed tokens: 5872025600 | elapsed time per iteration (s): 2.26 | learning rate: 1.791E-04 | global batch size: 512 | lm loss: 2.593286E+00 | grad norm: 1.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.418 | TFLOPs: 23.31 | 63: iteration 5610/ 24424 | consumed samples: 2872320 | consumed tokens: 5882511360 | elapsed time per iteration (s): 2.23 | learning rate: 1.790E-04 | global batch size: 512 | lm loss: 2.627152E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.396 | TFLOPs: 23.62 | 63: iteration 5620/ 24424 | consumed samples: 2877440 | consumed tokens: 5892997120 | elapsed time per iteration (s): 2.26 | learning rate: 1.789E-04 | global batch size: 512 | lm loss: 2.565838E+00 | grad norm: 0.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.924 | TFLOPs: 23.36 | 63: iteration 5630/ 24424 | consumed samples: 2882560 | consumed tokens: 5903482880 | elapsed time per iteration (s): 2.26 | learning rate: 1.789E-04 | global batch size: 512 | lm loss: 2.476192E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.781 | TFLOPs: 23.35 | 63: iteration 5640/ 24424 | consumed samples: 2887680 | consumed tokens: 5913968640 | elapsed time per iteration (s): 2.26 | learning rate: 1.788E-04 | global batch size: 512 | lm loss: 2.439689E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.206 | TFLOPs: 23.29 | 63: iteration 5650/ 24424 | consumed samples: 2892800 | consumed tokens: 5924454400 | elapsed time per iteration (s): 2.26 | learning rate: 1.787E-04 | global batch size: 512 | lm loss: 2.374320E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.512 | TFLOPs: 23.32 | 63: iteration 5660/ 24424 | consumed samples: 2897920 | consumed tokens: 5934940160 | elapsed time per iteration (s): 2.32 | learning rate: 1.786E-04 | global batch size: 512 | lm loss: 2.358750E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.254 | TFLOPs: 22.67 | 63: iteration 5670/ 24424 | consumed samples: 2903040 | consumed tokens: 5945425920 | elapsed time per iteration (s): 2.25 | learning rate: 1.785E-04 | global batch size: 512 | lm loss: 2.349237E+00 | grad norm: 0.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.990 | TFLOPs: 23.47 | 63: iteration 5680/ 24424 | consumed samples: 2908160 | consumed tokens: 5955911680 | elapsed time per iteration (s): 2.24 | learning rate: 1.785E-04 | global batch size: 512 | lm loss: 2.360449E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.414 | TFLOPs: 23.51 | 63: iteration 5690/ 24424 | consumed samples: 2913280 | consumed tokens: 5966397440 | elapsed time per iteration (s): 2.23 | learning rate: 1.784E-04 | global batch size: 512 | lm loss: 2.336780E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.731 | TFLOPs: 23.65 | 63: iteration 5700/ 24424 | consumed samples: 2918400 | consumed tokens: 5976883200 | elapsed time per iteration (s): 2.23 | learning rate: 1.783E-04 | global batch size: 512 | lm loss: 2.340624E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.795 | TFLOPs: 23.66 | 63: iteration 5710/ 24424 | consumed samples: 2923520 | consumed tokens: 5987368960 | elapsed time per iteration (s): 2.25 | learning rate: 1.782E-04 | global batch size: 512 | lm loss: 2.296270E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.933 | TFLOPs: 23.46 | 63: iteration 5720/ 24424 | consumed samples: 2928640 | consumed tokens: 5997854720 | elapsed time per iteration (s): 2.25 | learning rate: 1.782E-04 | global batch size: 512 | lm loss: 2.324936E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.474 | TFLOPs: 23.42 | 63: iteration 5730/ 24424 | consumed samples: 2933760 | consumed tokens: 6008340480 | elapsed time per iteration (s): 2.26 | learning rate: 1.781E-04 | global batch size: 512 | lm loss: 2.305534E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.533 | TFLOPs: 23.32 | 63: iteration 5740/ 24424 | consumed samples: 2938880 | consumed tokens: 6018826240 | elapsed time per iteration (s): 2.24 | learning rate: 1.780E-04 | global batch size: 512 | lm loss: 2.332770E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.304 | TFLOPs: 23.50 | 63: iteration 5750/ 24424 | consumed samples: 2944000 | consumed tokens: 6029312000 | elapsed time per iteration (s): 2.24 | learning rate: 1.779E-04 | global batch size: 512 | lm loss: 2.326590E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.609 | TFLOPs: 23.53 | 63: iteration 5760/ 24424 | consumed samples: 2949120 | consumed tokens: 6039797760 | elapsed time per iteration (s): 2.30 | learning rate: 1.779E-04 | global batch size: 512 | lm loss: 2.318637E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.807 | TFLOPs: 22.94 | 63: iteration 5770/ 24424 | consumed samples: 2954240 | consumed tokens: 6050283520 | elapsed time per iteration (s): 2.24 | learning rate: 1.778E-04 | global batch size: 512 | lm loss: 2.299492E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.756 | TFLOPs: 23.55 | 63: iteration 5780/ 24424 | consumed samples: 2959360 | consumed tokens: 6060769280 | elapsed time per iteration (s): 2.23 | learning rate: 1.777E-04 | global batch size: 512 | lm loss: 2.316279E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.807 | TFLOPs: 23.66 | 63: iteration 5790/ 24424 | consumed samples: 2964480 | consumed tokens: 6071255040 | elapsed time per iteration (s): 2.25 | learning rate: 1.776E-04 | global batch size: 512 | lm loss: 2.285260E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.941 | TFLOPs: 23.47 | 63: iteration 5800/ 24424 | consumed samples: 2969600 | consumed tokens: 6081740800 | elapsed time per iteration (s): 2.23 | learning rate: 1.776E-04 | global batch size: 512 | lm loss: 2.308458E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.152 | TFLOPs: 23.59 | 63: iteration 5810/ 24424 | consumed samples: 2974720 | consumed tokens: 6092226560 | elapsed time per iteration (s): 2.28 | learning rate: 1.775E-04 | global batch size: 512 | lm loss: 2.319502E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.640 | TFLOPs: 23.13 | 63: iteration 5820/ 24424 | consumed samples: 2979840 | consumed tokens: 6102712320 | elapsed time per iteration (s): 2.24 | learning rate: 1.774E-04 | global batch size: 512 | lm loss: 2.300958E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.727 | TFLOPs: 23.55 | 63: iteration 5830/ 24424 | consumed samples: 2984960 | consumed tokens: 6113198080 | elapsed time per iteration (s): 2.26 | learning rate: 1.773E-04 | global batch size: 512 | lm loss: 2.285460E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.405 | TFLOPs: 23.31 | 63: iteration 5840/ 24424 | consumed samples: 2990080 | consumed tokens: 6123683840 | elapsed time per iteration (s): 2.29 | learning rate: 1.772E-04 | global batch size: 512 | lm loss: 2.294473E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.873 | TFLOPs: 23.05 | 63: iteration 5850/ 24424 | consumed samples: 2995200 | consumed tokens: 6134169600 | elapsed time per iteration (s): 2.25 | learning rate: 1.772E-04 | global batch size: 512 | lm loss: 2.288968E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.837 | TFLOPs: 23.45 | 63: iteration 5860/ 24424 | consumed samples: 3000320 | consumed tokens: 6144655360 | elapsed time per iteration (s): 2.25 | learning rate: 1.771E-04 | global batch size: 512 | lm loss: 2.288002E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.168 | TFLOPs: 23.39 | 63: iteration 5870/ 24424 | consumed samples: 3005440 | consumed tokens: 6155141120 | elapsed time per iteration (s): 2.24 | learning rate: 1.770E-04 | global batch size: 512 | lm loss: 2.285548E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.081 | TFLOPs: 23.48 | 63: iteration 5880/ 24424 | consumed samples: 3010560 | consumed tokens: 6165626880 | elapsed time per iteration (s): 2.26 | learning rate: 1.769E-04 | global batch size: 512 | lm loss: 2.279033E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.719 | TFLOPs: 23.34 | 63: iteration 5890/ 24424 | consumed samples: 3015680 | consumed tokens: 6176112640 | elapsed time per iteration (s): 2.23 | learning rate: 1.769E-04 | global batch size: 512 | lm loss: 2.285263E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.664 | TFLOPs: 23.64 | 63: iteration 5900/ 24424 | consumed samples: 3020800 | consumed tokens: 6186598400 | elapsed time per iteration (s): 2.24 | learning rate: 1.768E-04 | global batch size: 512 | lm loss: 2.281178E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.006 | TFLOPs: 23.58 | 63: iteration 5910/ 24424 | consumed samples: 3025920 | consumed tokens: 6197084160 | elapsed time per iteration (s): 2.23 | learning rate: 1.767E-04 | global batch size: 512 | lm loss: 2.276953E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.436 | TFLOPs: 23.62 | 63: iteration 5920/ 24424 | consumed samples: 3031040 | consumed tokens: 6207569920 | elapsed time per iteration (s): 2.57 | learning rate: 1.766E-04 | global batch size: 512 | lm loss: 2.246842E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 198.919 | TFLOPs: 20.48 | 63: iteration 5930/ 24424 | consumed samples: 3036160 | consumed tokens: 6218055680 | elapsed time per iteration (s): 2.28 | learning rate: 1.765E-04 | global batch size: 512 | lm loss: 2.265014E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.623 | TFLOPs: 23.12 | 63: iteration 5940/ 24424 | consumed samples: 3041280 | consumed tokens: 6228541440 | elapsed time per iteration (s): 2.23 | learning rate: 1.765E-04 | global batch size: 512 | lm loss: 2.276691E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.726 | TFLOPs: 23.65 | 63: iteration 5950/ 24424 | consumed samples: 3046400 | consumed tokens: 6239027200 | elapsed time per iteration (s): 2.24 | learning rate: 1.764E-04 | global batch size: 512 | lm loss: 2.268649E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.729 | TFLOPs: 23.55 | 63: iteration 5960/ 24424 | consumed samples: 3051520 | consumed tokens: 6249512960 | elapsed time per iteration (s): 2.26 | learning rate: 1.763E-04 | global batch size: 512 | lm loss: 2.301990E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.767 | TFLOPs: 23.34 | 63: iteration 5970/ 24424 | consumed samples: 3056640 | consumed tokens: 6259998720 | elapsed time per iteration (s): 2.24 | learning rate: 1.762E-04 | global batch size: 512 | lm loss: 2.295772E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.140 | TFLOPs: 23.49 | 63: iteration 5980/ 24424 | consumed samples: 3061760 | consumed tokens: 6270484480 | elapsed time per iteration (s): 2.23 | learning rate: 1.761E-04 | global batch size: 512 | lm loss: 2.255342E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.338 | TFLOPs: 23.61 | 63: iteration 5990/ 24424 | consumed samples: 3066880 | consumed tokens: 6280970240 | elapsed time per iteration (s): 2.27 | learning rate: 1.761E-04 | global batch size: 512 | lm loss: 2.287530E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.794 | TFLOPs: 23.24 | 0: [2022-11-25 21:29:12,572] [INFO] [logging.py:68:log_dist] [Rank 0] step=6000, skipped=0, lr=[0.00017598672432253654, 0.00017598672432253654, 0.00017598672432253654], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] 63: iteration 6000/ 24424 | consumed samples: 3072000 | consumed tokens: 6291456000 | elapsed time per iteration (s): 2.25 | learning rate: 1.760E-04 | global batch size: 512 | lm loss: 2.286469E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.771 | TFLOPs: 23.45 | 0: steps: 6000 loss: 2.3303 iter time (s): 3.473 samples/sec: 147.404 63: ------------------------------------------------------------------------------------------ 63: valid loss at iteration 6000 | lm loss value: 2.239298E+00 | lm loss PPL: 9.386736E+00 | 63: ------------------------------------------------------------------------------------------ 0: saving checkpoint at iteration 6000 to checkpoints_3b9 0: [2022-11-25 21:29:13,317] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step6000 is begin to save! 0: [2022-11-25 21:29:13,336] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_01-model_00-model_states.pt... 32: [2022-11-25 21:29:13,336] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_21-model_00-model_states.pt... 32: [2022-11-25 21:29:13,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_21-model_00-model_states.pt. 32: [2022-11-25 21:29:13,553] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_22-model_00-model_states.pt... 0: [2022-11-25 21:29:13,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_01-model_00-model_states.pt. 0: [2022-11-25 21:29:13,694] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_03-model_00-model_states.pt... 32: [2022-11-25 21:29:13,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_22-model_00-model_states.pt. 32: [2022-11-25 21:29:13,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_23-model_00-model_states.pt... 0: [2022-11-25 21:29:13,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_03-model_00-model_states.pt. 0: [2022-11-25 21:29:13,935] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_04-model_00-model_states.pt... 32: [2022-11-25 21:29:13,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_23-model_00-model_states.pt. 32: [2022-11-25 21:29:13,980] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_24-model_00-model_states.pt... 0: [2022-11-25 21:29:14,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_04-model_00-model_states.pt. 0: [2022-11-25 21:29:14,174] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_05-model_00-model_states.pt... 32: [2022-11-25 21:29:14,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_24-model_00-model_states.pt. 32: [2022-11-25 21:29:14,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_25-model_00-model_states.pt... 32: [2022-11-25 21:29:14,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_25-model_00-model_states.pt. 32: [2022-11-25 21:29:14,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_26-model_00-model_states.pt... 0: [2022-11-25 21:29:14,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_05-model_00-model_states.pt. 0: [2022-11-25 21:29:14,415] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_06-model_00-model_states.pt... 32: [2022-11-25 21:29:14,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_26-model_00-model_states.pt. 32: [2022-11-25 21:29:14,626] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_27-model_00-model_states.pt... 0: [2022-11-25 21:29:14,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_06-model_00-model_states.pt. 0: [2022-11-25 21:29:14,637] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_07-model_00-model_states.pt... 32: [2022-11-25 21:29:14,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_27-model_00-model_states.pt. 32: [2022-11-25 21:29:14,842] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_28-model_00-model_states.pt... 0: [2022-11-25 21:29:14,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_07-model_00-model_states.pt. 0: [2022-11-25 21:29:14,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_08-model_00-model_states.pt... 32: [2022-11-25 21:29:15,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_28-model_00-model_states.pt. 32: [2022-11-25 21:29:15,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_29-model_00-model_states.pt... 0: [2022-11-25 21:29:15,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_08-model_00-model_states.pt. 0: [2022-11-25 21:29:15,080] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_09-model_00-model_states.pt... 32: [2022-11-25 21:29:15,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_29-model_00-model_states.pt. 32: [2022-11-25 21:29:15,292] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_30-model_00-model_states.pt... 0: [2022-11-25 21:29:15,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_09-model_00-model_states.pt. 0: [2022-11-25 21:29:15,295] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_10-model_00-model_states.pt... 32: [2022-11-25 21:29:15,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_30-model_00-model_states.pt. 32: [2022-11-25 21:29:15,507] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_31-model_00-model_states.pt... 0: [2022-11-25 21:29:15,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_10-model_00-model_states.pt. 0: [2022-11-25 21:29:15,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_11-model_00-model_states.pt... 32: [2022-11-25 21:29:15,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_31-model_00-model_states.pt. 32: [2022-11-25 21:29:15,718] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_32-model_00-model_states.pt... 0: [2022-11-25 21:29:15,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_11-model_00-model_states.pt. 0: [2022-11-25 21:29:15,729] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_12-model_00-model_states.pt... 32: [2022-11-25 21:29:15,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_32-model_00-model_states.pt. 32: [2022-11-25 21:29:15,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_33-model_00-model_states.pt... 0: [2022-11-25 21:29:15,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_12-model_00-model_states.pt. 0: [2022-11-25 21:29:15,944] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_13-model_00-model_states.pt... 32: [2022-11-25 21:29:16,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_33-model_00-model_states.pt. 32: [2022-11-25 21:29:16,153] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_34-model_00-model_states.pt... 0: [2022-11-25 21:29:16,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_13-model_00-model_states.pt. 0: [2022-11-25 21:29:16,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_14-model_00-model_states.pt... 32: [2022-11-25 21:29:16,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_34-model_00-model_states.pt. 32: [2022-11-25 21:29:16,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_35-model_00-model_states.pt... 0: [2022-11-25 21:29:16,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_14-model_00-model_states.pt. 0: [2022-11-25 21:29:16,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_15-model_00-model_states.pt... 32: [2022-11-25 21:29:16,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_35-model_00-model_states.pt. 32: [2022-11-25 21:29:16,589] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_36-model_00-model_states.pt... 0: [2022-11-25 21:29:16,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_15-model_00-model_states.pt. 0: [2022-11-25 21:29:16,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_16-model_00-model_states.pt... 32: [2022-11-25 21:29:16,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_36-model_00-model_states.pt. 32: [2022-11-25 21:29:16,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_37-model_00-model_states.pt... 0: [2022-11-25 21:29:16,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_16-model_00-model_states.pt. 0: [2022-11-25 21:29:16,831] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_17-model_00-model_states.pt... 32: [2022-11-25 21:29:17,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_37-model_00-model_states.pt. 32: [2022-11-25 21:29:17,014] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_38-model_00-model_states.pt... 0: [2022-11-25 21:29:17,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_17-model_00-model_states.pt. 0: [2022-11-25 21:29:17,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_18-model_00-model_states.pt... 32: [2022-11-25 21:29:17,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_38-model_00-model_states.pt. 32: [2022-11-25 21:29:17,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_40-model_00-model_states.pt... 32: [2022-11-25 21:29:17,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_40-model_00-model_states.pt. 32: [2022-11-25 21:29:17,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/mp_rank_01_model_states.pt... 32: [2022-11-25 21:29:17,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/mp_rank_01_model_states.pt. 0: [2022-11-25 21:29:17,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_18-model_00-model_states.pt. 0: [2022-11-25 21:29:17,455] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_19-model_00-model_states.pt... 0: [2022-11-25 21:29:17,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_19-model_00-model_states.pt. 0: [2022-11-25 21:29:17,767] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/layer_20-model_00-model_states.pt... 0: [2022-11-25 21:29:18,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/layer_20-model_00-model_states.pt. 0: [2022-11-25 21:29:18,084] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step6000/mp_rank_00_model_states.pt 0: [2022-11-25 21:29:18,084] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/mp_rank_00_model_states.pt... 0: [2022-11-25 21:29:18,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/mp_rank_00_model_states.pt. 51: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 51: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 55: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 55: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 53: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 53: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 53: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 57: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 57: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 57: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 61: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 59: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 59: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 59: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 63: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 63: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 63: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 63: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 19: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 21: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 15: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 15: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 15: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 15: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 15: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 15: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 52: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 52: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 54: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 54: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 56: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 58: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 58: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 58: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 62: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 62: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 16: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 16: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 16: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 16: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 60: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 60: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 60: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 2: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 2: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 2: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 4: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 14: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 14: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 18: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 18: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 18: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 10: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 10: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 8: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 8: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 8: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 8: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 0: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 26: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 26: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 26: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 11: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 11: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 17: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 17: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 17: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 9: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 9: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 9: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 9: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 9: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 13: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 13: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 13: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 13: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 23: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 23: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 23: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 23: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 23: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 23: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 25: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 27: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 27: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 27: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 29: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 29: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 31: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 31: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 5: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 5: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 5: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 33: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 33: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 33: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 33: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 1: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 1: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 1: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 1: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 1: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 1: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 7: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 7: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 7: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 3: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 35: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 35: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 22: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 22: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 22: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 22: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 22: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 22: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 12: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 30: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 30: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 24: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 24: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 24: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 24: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 28: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 28: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 28: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 28: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 28: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 20: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 20: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 34: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 34: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 34: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 40: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 40: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 44: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 44: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 44: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 44: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 6: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 32: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 42: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 42: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 46: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 46: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 46: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 46: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 36: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 36: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 48: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 48: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 41: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 41: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 41: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 37: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 37: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 37: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 47: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 47: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 47: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 49: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 49: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 49: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 43: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 43: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 43: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 45: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 45: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 45: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 39: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 39: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 39: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 50: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 50: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 38: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 38: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 51: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 55: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 53: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 53: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 57: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 61: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 59: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 63: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 63: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 19: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 21: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 21: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 21: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 21: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 15: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 15: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 52: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 52: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 54: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 54: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 56: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 58: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 62: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 62: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 16: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 60: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 2: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 2: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 2: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 2: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 2: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 4: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 14: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 14: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 14: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 18: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 18: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 18: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 10: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 10: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 10: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 10: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 8: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 8: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 8: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 0: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 0: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 26: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 11: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 11: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 11: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 17: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 17: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 17: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 9: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 9: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 9: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 13: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 13: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 13: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 13: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 23: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 25: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 25: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 25: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 25: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 27: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 27: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 27: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 29: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 29: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 29: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 31: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 31: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 5: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 5: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 33: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 1: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 1: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 7: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 7: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 7: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 7: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 3: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 3: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 3: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 3: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 3: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 3: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 35: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 35: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 22: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 12: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 12: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 12: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 30: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 24: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 28: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 20: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 20: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 20: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 20: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 34: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 40: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 44: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 6: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 6: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 32: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 32: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 32: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 32: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 32: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 32: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 42: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 42: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 46: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 36: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 48: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 48: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 41: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 41: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 37: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 37: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 47: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 49: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 43: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 45: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 39: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 50: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 38: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 51: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 61: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 19: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 19: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 19: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 19: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 21: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 21: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 21: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 56: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 16: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 16: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 16: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 4: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 4: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 14: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 14: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 14: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 18: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 10: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 8: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 0: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 26: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 26: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 11: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 11: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 17: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 23: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 25: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 27: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 29: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 31: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 31: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 31: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 31: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 5: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 33: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 3: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 22: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 12: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 12: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 12: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 30: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 30: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 24: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 28: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 20: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 40: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 6: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 32: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 49: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 45: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 19: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 4: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 18: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 10: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 0: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 0: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 27: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 29: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 5: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 12: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 30: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 24: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 6: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 6: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 19: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 4: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 0: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 29: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 30: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 24: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 6: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 4: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 4: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 30: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 6: [2022-11-25 21:29:18,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 9: [2022-11-25 21:29:18,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 9: [2022-11-25 21:29:18,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-25 21:29:18,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 5: [2022-11-25 21:29:18,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-25 21:29:18,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 14: [2022-11-25 21:29:18,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 5: [2022-11-25 21:29:18,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 14: [2022-11-25 21:29:18,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-25 21:29:18,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 30: [2022-11-25 21:29:18,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-25 21:29:18,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-25 21:29:18,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 2: [2022-11-25 21:29:18,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-25 21:29:18,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-25 21:29:18,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 11: [2022-11-25 21:29:18,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 11: [2022-11-25 21:29:18,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 11: [2022-11-25 21:29:18,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 7: [2022-11-25 21:29:18,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 7: [2022-11-25 21:29:18,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-25 21:29:18,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 23: [2022-11-25 21:29:18,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-25 21:29:18,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-25 21:29:18,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 59: [2022-11-25 21:29:18,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 41: [2022-11-25 21:29:18,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 59: [2022-11-25 21:29:18,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 41: [2022-11-25 21:29:18,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 59: [2022-11-25 21:29:18,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 41: [2022-11-25 21:29:18,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 37: [2022-11-25 21:29:18,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-25 21:29:18,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 57: [2022-11-25 21:29:18,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 37: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 22: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-25 21:29:18,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 57: [2022-11-25 21:29:18,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 22: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 24: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 57: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 56: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 24: [2022-11-25 21:29:18,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 56: [2022-11-25 21:29:18,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 27: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 56: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 27: [2022-11-25 21:29:18,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 4: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 25: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 4: [2022-11-25 21:29:18,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 25: [2022-11-25 21:29:18,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 4: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 44: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 33: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-25 21:29:18,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 0: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 26: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 33: [2022-11-25 21:29:18,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 16: [2022-11-25 21:29:18,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 44: [2022-11-25 21:29:18,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 16: [2022-11-25 21:29:18,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-25 21:29:18,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 26: [2022-11-25 21:29:18,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-25 21:29:18,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 19: [2022-11-25 21:29:18,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 19: [2022-11-25 21:29:18,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-25 21:29:18,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 15: [2022-11-25 21:29:18,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-25 21:29:18,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 17: [2022-11-25 21:29:18,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 15: [2022-11-25 21:29:18,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 17: [2022-11-25 21:29:18,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 17: [2022-11-25 21:29:18,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 48: [2022-11-25 21:29:18,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-25 21:29:18,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-25 21:29:18,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 18: [2022-11-25 21:29:18,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-25 21:29:18,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-25 21:29:18,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 32: [2022-11-25 21:29:18,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-25 21:29:18,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-25 21:29:18,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 6: [2022-11-25 21:29:18,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 8: [2022-11-25 21:29:18,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-25 21:29:18,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-25 21:29:18,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 39: [2022-11-25 21:29:18,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-25 21:29:18,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 43: [2022-11-25 21:29:18,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 39: [2022-11-25 21:29:18,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 43: [2022-11-25 21:29:18,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-25 21:29:18,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 6: [2022-11-25 21:29:18,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-25 21:29:18,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 6: [2022-11-25 21:29:18,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 34: [2022-11-25 21:29:18,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 6: [2022-11-25 21:29:18,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-25 21:29:18,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 10: [2022-11-25 21:29:18,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 34: [2022-11-25 21:29:18,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 10: [2022-11-25 21:29:18,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 34: [2022-11-25 21:29:18,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 10: [2022-11-25 21:29:18,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 31: [2022-11-25 21:29:18,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-25 21:29:18,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-25 21:29:18,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 55: [2022-11-25 21:29:18,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-25 21:29:18,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 53: [2022-11-25 21:29:18,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 55: [2022-11-25 21:29:18,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 53: [2022-11-25 21:29:18,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-25 21:29:18,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 49: [2022-11-25 21:29:18,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-25 21:29:18,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-25 21:29:18,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 3: [2022-11-25 21:29:18,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 3: [2022-11-25 21:29:18,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-25 21:29:18,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 61: [2022-11-25 21:29:18,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-25 21:29:18,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 46: [2022-11-25 21:29:18,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 61: [2022-11-25 21:29:18,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 46: [2022-11-25 21:29:18,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 46: [2022-11-25 21:29:18,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 4: [2022-11-25 21:29:18,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 4: [2022-11-25 21:29:18,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-25 21:29:18,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 58: [2022-11-25 21:29:18,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-25 21:29:18,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-25 21:29:18,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 21: [2022-11-25 21:29:18,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 21: [2022-11-25 21:29:18,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-25 21:29:18,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 12: [2022-11-25 21:29:18,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-25 21:29:18,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-25 21:29:18,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 5: [2022-11-25 21:29:18,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 0: [2022-11-25 21:29:18,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 5: [2022-11-25 21:29:18,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 0: [2022-11-25 21:29:18,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 5: [2022-11-25 21:29:18,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 0: [2022-11-25 21:29:18,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 7: [2022-11-25 21:29:18,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-25 21:29:18,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-25 21:29:18,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 40: [2022-11-25 21:29:18,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-25 21:29:18,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-25 21:29:18,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 50: [2022-11-25 21:29:18,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 13: [2022-11-25 21:29:18,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 13: [2022-11-25 21:29:18,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-25 21:29:18,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 50: [2022-11-25 21:29:18,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-25 21:29:18,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 38: [2022-11-25 21:29:18,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-25 21:29:18,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-25 21:29:18,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 28: [2022-11-25 21:29:18,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-25 21:29:18,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-25 21:29:18,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 60: [2022-11-25 21:29:18,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-25 21:29:18,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 36: [2022-11-25 21:29:18,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 60: [2022-11-25 21:29:18,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 36: [2022-11-25 21:29:18,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-25 21:29:18,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 34: [2022-11-25 21:29:18,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 34: [2022-11-25 21:29:18,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-25 21:29:18,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 23: [2022-11-25 21:29:18,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-25 21:29:18,366] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-25 21:29:18,366] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 30: [2022-11-25 21:29:18,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-25 21:29:18,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-25 21:29:18,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 49: [2022-11-25 21:29:18,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 49: [2022-11-25 21:29:18,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-25 21:29:18,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 42: [2022-11-25 21:29:18,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 57: [2022-11-25 21:29:18,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 57: [2022-11-25 21:29:18,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 42: [2022-11-25 21:29:18,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-25 21:29:18,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 57: [2022-11-25 21:29:18,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 14: [2022-11-25 21:29:18,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 14: [2022-11-25 21:29:18,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-25 21:29:18,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 1: [2022-11-25 21:29:18,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 31: [2022-11-25 21:29:18,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 1: [2022-11-25 21:29:18,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-25 21:29:18,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 31: [2022-11-25 21:29:18,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-25 21:29:18,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 47: [2022-11-25 21:29:18,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-25 21:29:18,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 47: [2022-11-25 21:29:18,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 47: [2022-11-25 21:29:18,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 47: [2022-11-25 21:29:18,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 47: [2022-11-25 21:29:18,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 51: [2022-11-25 21:29:18,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-25 21:29:18,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-25 21:29:18,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 58: [2022-11-25 21:29:18,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-25 21:29:18,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-25 21:29:18,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 49: [2022-11-25 21:29:18,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-25 21:29:18,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 25: [2022-11-25 21:29:18,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 49: [2022-11-25 21:29:18,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 25: [2022-11-25 21:29:18,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-25 21:29:18,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 3: [2022-11-25 21:29:18,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-25 21:29:18,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 3: [2022-11-25 21:29:18,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 38: [2022-11-25 21:29:18,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-25 21:29:18,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-25 21:29:18,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 11: [2022-11-25 21:29:18,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-25 21:29:18,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-25 21:29:18,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 27: [2022-11-25 21:29:18,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-25 21:29:18,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-25 21:29:18,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 42: [2022-11-25 21:29:18,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 42: [2022-11-25 21:29:18,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-25 21:29:18,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 62: [2022-11-25 21:29:18,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-25 21:29:18,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-25 21:29:18,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 32: [2022-11-25 21:29:18,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-25 21:29:18,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 53: [2022-11-25 21:29:18,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-25 21:29:18,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-25 21:29:18,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 4: [2022-11-25 21:29:18,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 32: [2022-11-25 21:29:18,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 48: [2022-11-25 21:29:18,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-25 21:29:18,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-25 21:29:18,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 39: [2022-11-25 21:29:18,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-25 21:29:18,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-25 21:29:18,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 19: [2022-11-25 21:29:18,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 45: [2022-11-25 21:29:18,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-25 21:29:18,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 19: [2022-11-25 21:29:18,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 45: [2022-11-25 21:29:18,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 19: [2022-11-25 21:29:18,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 45: [2022-11-25 21:29:18,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-25 21:29:18,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 45: [2022-11-25 21:29:18,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 0: [2022-11-25 21:29:18,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-25 21:29:18,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-25 21:29:18,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 34: [2022-11-25 21:29:18,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 34: [2022-11-25 21:29:18,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-25 21:29:18,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 43: [2022-11-25 21:29:18,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-25 21:29:18,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-25 21:29:18,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 51: [2022-11-25 21:29:18,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 51: [2022-11-25 21:29:18,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-25 21:29:18,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 5: [2022-11-25 21:29:18,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 10: [2022-11-25 21:29:18,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 5: [2022-11-25 21:29:18,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 10: [2022-11-25 21:29:18,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 5: [2022-11-25 21:29:18,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 10: [2022-11-25 21:29:18,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 61: [2022-11-25 21:29:18,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 61: [2022-11-25 21:29:18,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-25 21:29:18,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 14: [2022-11-25 21:29:18,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-25 21:29:18,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-25 21:29:18,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 62: [2022-11-25 21:29:18,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 61: [2022-11-25 21:29:18,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 62: [2022-11-25 21:29:18,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 61: [2022-11-25 21:29:18,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 62: [2022-11-25 21:29:18,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 16: [2022-11-25 21:29:18,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 61: [2022-11-25 21:29:18,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 16: [2022-11-25 21:29:18,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-25 21:29:18,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 19: [2022-11-25 21:29:18,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-25 21:29:18,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-25 21:29:18,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 16: [2022-11-25 21:29:18,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 16: [2022-11-25 21:29:18,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-25 21:29:18,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 57: [2022-11-25 21:29:18,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 12: [2022-11-25 21:29:18,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 12: [2022-11-25 21:29:18,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 57: [2022-11-25 21:29:18,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 12: [2022-11-25 21:29:18,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 57: [2022-11-25 21:29:18,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 62: [2022-11-25 21:29:18,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-25 21:29:18,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-25 21:29:18,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 21: [2022-11-25 21:29:18,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-25 21:29:18,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-25 21:29:18,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 7: [2022-11-25 21:29:18,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 7: [2022-11-25 21:29:18,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 32: [2022-11-25 21:29:18,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 1: [2022-11-25 21:29:18,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 7: [2022-11-25 21:29:18,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 1: [2022-11-25 21:29:18,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 32: [2022-11-25 21:29:18,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 1: [2022-11-25 21:29:18,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 32: [2022-11-25 21:29:18,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 42: [2022-11-25 21:29:18,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 3: [2022-11-25 21:29:18,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 42: [2022-11-25 21:29:18,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 3: [2022-11-25 21:29:18,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 42: [2022-11-25 21:29:18,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 3: [2022-11-25 21:29:18,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 4: [2022-11-25 21:29:18,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-25 21:29:18,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 46: [2022-11-25 21:29:18,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-25 21:29:18,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-25 21:29:18,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 20: [2022-11-25 21:29:18,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-25 21:29:18,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 20: [2022-11-25 21:29:18,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-25 21:29:18,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-25 21:29:18,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 20: [2022-11-25 21:29:18,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 35: [2022-11-25 21:29:18,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-25 21:29:18,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-25 21:29:18,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 12: [2022-11-25 21:29:18,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 12: [2022-11-25 21:29:18,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-25 21:29:18,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 41: [2022-11-25 21:29:18,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 37: [2022-11-25 21:29:18,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 41: [2022-11-25 21:29:18,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 37: [2022-11-25 21:29:18,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 41: [2022-11-25 21:29:18,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 37: [2022-11-25 21:29:18,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 41: [2022-11-25 21:29:18,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-25 21:29:18,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-25 21:29:18,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 40: [2022-11-25 21:29:18,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-25 21:29:18,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-25 21:29:18,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 8: [2022-11-25 21:29:18,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-25 21:29:18,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-25 21:29:18,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 53: [2022-11-25 21:29:18,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-25 21:29:18,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 5: [2022-11-25 21:29:18,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 53: [2022-11-25 21:29:18,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 5: [2022-11-25 21:29:18,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-25 21:29:18,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 55: [2022-11-25 21:29:18,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 7: [2022-11-25 21:29:18,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 55: [2022-11-25 21:29:18,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 7: [2022-11-25 21:29:18,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-25 21:29:18,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 55: [2022-11-25 21:29:18,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 55: [2022-11-25 21:29:18,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 60: [2022-11-25 21:29:18,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-25 21:29:18,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 55: [2022-11-25 21:29:18,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-25 21:29:18,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 60: [2022-11-25 21:29:18,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 11: [2022-11-25 21:29:18,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 11: [2022-11-25 21:29:18,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 27: [2022-11-25 21:29:18,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 11: [2022-11-25 21:29:18,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 27: [2022-11-25 21:29:18,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-25 21:29:18,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 35: [2022-11-25 21:29:18,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-25 21:29:18,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-25 21:29:18,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 11: [2022-11-25 21:29:18,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-25 21:29:18,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-25 21:29:18,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 20: [2022-11-25 21:29:18,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-25 21:29:18,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-25 21:29:18,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 18: [2022-11-25 21:29:18,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-25 21:29:18,390] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-25 21:29:18,390] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 63: [2022-11-25 21:29:18,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-25 21:29:18,390] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-25 21:29:18,390] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 13: [2022-11-25 21:29:18,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-25 21:29:18,390] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-25 21:29:18,390] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 0: [2022-11-25 21:29:18,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-25 21:29:18,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-25 21:29:18,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 50: [2022-11-25 21:29:18,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-25 21:29:18,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-25 21:29:18,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 14: [2022-11-25 21:29:18,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 43: [2022-11-25 21:29:18,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-25 21:29:18,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 14: [2022-11-25 21:29:18,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 43: [2022-11-25 21:29:18,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 14: [2022-11-25 21:29:18,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 39: [2022-11-25 21:29:18,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-25 21:29:18,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-25 21:29:18,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 10: [2022-11-25 21:29:18,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-25 21:29:18,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-25 21:29:18,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 52: [2022-11-25 21:29:18,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-25 21:29:18,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-25 21:29:18,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 38: [2022-11-25 21:29:18,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 38: [2022-11-25 21:29:18,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-25 21:29:18,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 60: [2022-11-25 21:29:18,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-25 21:29:18,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-25 21:29:18,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 45: [2022-11-25 21:29:18,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 45: [2022-11-25 21:29:18,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 45: [2022-11-25 21:29:18,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 29: [2022-11-25 21:29:18,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 29: [2022-11-25 21:29:18,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-25 21:29:18,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-25 21:29:18,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-25 21:29:18,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 29: [2022-11-25 21:29:18,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 45: [2022-11-25 21:29:18,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-25 21:29:18,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-25 21:29:18,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 28: [2022-11-25 21:29:18,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 28: [2022-11-25 21:29:18,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-25 21:29:18,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 58: [2022-11-25 21:29:18,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-25 21:29:18,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-25 21:29:18,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 49: [2022-11-25 21:29:18,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-25 21:29:18,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-25 21:29:18,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 42: [2022-11-25 21:29:18,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 42: [2022-11-25 21:29:18,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-25 21:29:18,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 21: [2022-11-25 21:29:18,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-25 21:29:18,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-25 21:29:18,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 4: [2022-11-25 21:29:18,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-25 21:29:18,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-25 21:29:18,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 6: [2022-11-25 21:29:18,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 6: [2022-11-25 21:29:18,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 6: [2022-11-25 21:29:18,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 26: [2022-11-25 21:29:18,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 26: [2022-11-25 21:29:18,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-25 21:29:18,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 35: [2022-11-25 21:29:18,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-25 21:29:18,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-25 21:29:18,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 46: [2022-11-25 21:29:18,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-25 21:29:18,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-25 21:29:18,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 15: [2022-11-25 21:29:18,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-25 21:29:18,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 30: [2022-11-25 21:29:18,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 15: [2022-11-25 21:29:18,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 1: [2022-11-25 21:29:18,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 1: [2022-11-25 21:29:18,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-25 21:29:18,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 30: [2022-11-25 21:29:18,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-25 21:29:18,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 15: [2022-11-25 21:29:18,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-25 21:29:18,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-25 21:29:18,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 3: [2022-11-25 21:29:18,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 3: [2022-11-25 21:29:18,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-25 21:29:18,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 30: [2022-11-25 21:29:18,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-25 21:29:18,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-25 21:29:18,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 58: [2022-11-25 21:29:18,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 37: [2022-11-25 21:29:18,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 58: [2022-11-25 21:29:18,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 37: [2022-11-25 21:29:18,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 58: [2022-11-25 21:29:18,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 37: [2022-11-25 21:29:18,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 34: [2022-11-25 21:29:18,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 34: [2022-11-25 21:29:18,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-25 21:29:18,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 57: [2022-11-25 21:29:18,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 57: [2022-11-25 21:29:18,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-25 21:29:18,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 61: [2022-11-25 21:29:18,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 61: [2022-11-25 21:29:18,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-25 21:29:18,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 0: [2022-11-25 21:29:18,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-25 21:29:18,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 54: [2022-11-25 21:29:18,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 54: [2022-11-25 21:29:18,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 54: [2022-11-25 21:29:18,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 54: [2022-11-25 21:29:18,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-25 21:29:18,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-25 21:29:18,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-25 21:29:18,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 54: [2022-11-25 21:29:18,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 54: [2022-11-25 21:29:18,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 16: [2022-11-25 21:29:18,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 43: [2022-11-25 21:29:18,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-25 21:29:18,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 16: [2022-11-25 21:29:18,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 43: [2022-11-25 21:29:18,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 16: [2022-11-25 21:29:18,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 27: [2022-11-25 21:29:18,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 27: [2022-11-25 21:29:18,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-25 21:29:18,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 17: [2022-11-25 21:29:18,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-25 21:29:18,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-25 21:29:18,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 19: [2022-11-25 21:29:18,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 19: [2022-11-25 21:29:18,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-25 21:29:18,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 52: [2022-11-25 21:29:18,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-25 21:29:18,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-25 21:29:18,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 39: [2022-11-25 21:29:18,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-25 21:29:18,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-25 21:29:18,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 54: [2022-11-25 21:29:18,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 54: [2022-11-25 21:29:18,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-25 21:29:18,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 44: [2022-11-25 21:29:18,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 44: [2022-11-25 21:29:18,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 44: [2022-11-25 21:29:18,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 29: [2022-11-25 21:29:18,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-25 21:29:18,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-25 21:29:18,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-25 21:29:18,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-25 21:29:18,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 29: [2022-11-25 21:29:18,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 6: [2022-11-25 21:29:18,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-25 21:29:18,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-25 21:29:18,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 35: [2022-11-25 21:29:18,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 35: [2022-11-25 21:29:18,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-25 21:29:18,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 44: [2022-11-25 21:29:18,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-25 21:29:18,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-25 21:29:18,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 47: [2022-11-25 21:29:18,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-25 21:29:18,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-25 21:29:18,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 1: [2022-11-25 21:29:18,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-25 21:29:18,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-25 21:29:18,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 52: [2022-11-25 21:29:18,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 52: [2022-11-25 21:29:18,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-25 21:29:18,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 55: [2022-11-25 21:29:18,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-25 21:29:18,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-25 21:29:18,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 21: [2022-11-25 21:29:18,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-25 21:29:18,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-25 21:29:18,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 60: [2022-11-25 21:29:18,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 60: [2022-11-25 21:29:18,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 60: [2022-11-25 21:29:18,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 53: [2022-11-25 21:29:18,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-25 21:29:18,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-25 21:29:18,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 33: [2022-11-25 21:29:18,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-25 21:29:18,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-25 21:29:18,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 12: [2022-11-25 21:29:18,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-25 21:29:18,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-25 21:29:18,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 48: [2022-11-25 21:29:18,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 48: [2022-11-25 21:29:18,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-25 21:29:18,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 56: [2022-11-25 21:29:18,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 18: [2022-11-25 21:29:18,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-25 21:29:18,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-25 21:29:18,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 56: [2022-11-25 21:29:18,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 56: [2022-11-25 21:29:18,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 15: [2022-11-25 21:29:18,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-25 21:29:18,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-25 21:29:18,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 20: [2022-11-25 21:29:18,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-25 21:29:18,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-25 21:29:18,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 44: [2022-11-25 21:29:18,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-25 21:29:18,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-25 21:29:18,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 50: [2022-11-25 21:29:18,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-25 21:29:18,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-25 21:29:18,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 32: [2022-11-25 21:29:18,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-25 21:29:18,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-25 21:29:18,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 2: [2022-11-25 21:29:18,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 2: [2022-11-25 21:29:18,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-25 21:29:18,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 22: [2022-11-25 21:29:18,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-25 21:29:18,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-25 21:29:18,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 63: [2022-11-25 21:29:18,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-25 21:29:18,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-25 21:29:18,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 36: [2022-11-25 21:29:18,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 36: [2022-11-25 21:29:18,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-25 21:29:18,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 24: [2022-11-25 21:29:18,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-25 21:29:18,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-25 21:29:18,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 23: [2022-11-25 21:29:18,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-25 21:29:18,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-25 21:29:18,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 48: [2022-11-25 21:29:18,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-25 21:29:18,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-25 21:29:18,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 24: [2022-11-25 21:29:18,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 24: [2022-11-25 21:29:18,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-25 21:29:18,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 38: [2022-11-25 21:29:18,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 38: [2022-11-25 21:29:18,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-25 21:29:18,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 8: [2022-11-25 21:29:18,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-25 21:29:18,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-25 21:29:18,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 56: [2022-11-25 21:29:18,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-25 21:29:18,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-25 21:29:18,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 30: [2022-11-25 21:29:18,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-25 21:29:18,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 30: [2022-11-25 21:29:18,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 63: [2022-11-25 21:29:18,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-25 21:29:18,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-25 21:29:18,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 18: [2022-11-25 21:29:18,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-25 21:29:18,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-25 21:29:18,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 9: [2022-11-25 21:29:18,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-25 21:29:18,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-25 21:29:18,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 47: [2022-11-25 21:29:18,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-25 21:29:18,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-25 21:29:18,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 52: [2022-11-25 21:29:18,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-25 21:29:18,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-25 21:29:18,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 17: [2022-11-25 21:29:18,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-25 21:29:18,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-25 21:29:18,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 46: [2022-11-25 21:29:18,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-25 21:29:18,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-25 21:29:18,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 11: [2022-11-25 21:29:18,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-25 21:29:18,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 2: [2022-11-25 21:29:18,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 11: [2022-11-25 21:29:18,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 2: [2022-11-25 21:29:18,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-25 21:29:18,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 7: [2022-11-25 21:29:18,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 7: [2022-11-25 21:29:18,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 23: [2022-11-25 21:29:18,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 7: [2022-11-25 21:29:18,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 23: [2022-11-25 21:29:18,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 23: [2022-11-25 21:29:18,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 9: [2022-11-25 21:29:18,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 9: [2022-11-25 21:29:18,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 36: [2022-11-25 21:29:18,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 9: [2022-11-25 21:29:18,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 36: [2022-11-25 21:29:18,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-25 21:29:18,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 33: [2022-11-25 21:29:18,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-25 21:29:18,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-25 21:29:18,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 59: [2022-11-25 21:29:18,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-25 21:29:18,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-25 21:29:18,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 22: [2022-11-25 21:29:18,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-25 21:29:18,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-25 21:29:18,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 51: [2022-11-25 21:29:18,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-25 21:29:18,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-25 21:29:18,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 0: [2022-11-25 21:29:18,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-25 21:29:18,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-25 21:29:18,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 50: [2022-11-25 21:29:18,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 14: [2022-11-25 21:29:18,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 50: [2022-11-25 21:29:18,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-25 21:29:18,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 14: [2022-11-25 21:29:18,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-25 21:29:18,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 9: [2022-11-25 21:29:18,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-25 21:29:18,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-25 21:29:18,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 6: [2022-11-25 21:29:18,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 6: [2022-11-25 21:29:18,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 6: [2022-11-25 21:29:18,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 22: [2022-11-25 21:29:18,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-25 21:29:18,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-25 21:29:18,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 2: [2022-11-25 21:29:18,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-25 21:29:18,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-25 21:29:18,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 37: [2022-11-25 21:29:18,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-25 21:29:18,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-25 21:29:18,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 36: [2022-11-25 21:29:18,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 10: [2022-11-25 21:29:18,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 10: [2022-11-25 21:29:18,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 10: [2022-11-25 21:29:18,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 18: [2022-11-25 21:29:18,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-25 21:29:18,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 13: [2022-11-25 21:29:18,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 18: [2022-11-25 21:29:18,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 13: [2022-11-25 21:29:18,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-25 21:29:18,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 13: [2022-11-25 21:29:18,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-25 21:29:18,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-25 21:29:18,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 28: [2022-11-25 21:29:18,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-25 21:29:18,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 36: [2022-11-25 21:29:18,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 28: [2022-11-25 21:29:18,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 36: [2022-11-25 21:29:18,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 56: [2022-11-25 21:29:18,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-25 21:29:18,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-25 21:29:18,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 5: [2022-11-25 21:29:18,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-25 21:29:18,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-25 21:29:18,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 17: [2022-11-25 21:29:18,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-25 21:29:18,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-25 21:29:18,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 25: [2022-11-25 21:29:18,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 25: [2022-11-25 21:29:18,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 25: [2022-11-25 21:29:18,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 4: [2022-11-25 21:29:18,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 4: [2022-11-25 21:29:18,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-25 21:29:18,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 27: [2022-11-25 21:29:18,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-25 21:29:18,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-25 21:29:18,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 59: [2022-11-25 21:29:18,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 59: [2022-11-25 21:29:18,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-25 21:29:18,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 26: [2022-11-25 21:29:18,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-25 21:29:18,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-25 21:29:18,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 41: [2022-11-25 21:29:18,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-25 21:29:18,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 41: [2022-11-25 21:29:18,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 45: [2022-11-25 21:29:18,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 45: [2022-11-25 21:29:18,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-25 21:29:18,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 24: [2022-11-25 21:29:18,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-25 21:29:18,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-25 21:29:18,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 13: [2022-11-25 21:29:18,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 13: [2022-11-25 21:29:18,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-25 21:29:18,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 3: [2022-11-25 21:29:18,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-25 21:29:18,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-25 21:29:18,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 37: [2022-11-25 21:29:18,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-25 21:29:18,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-25 21:29:18,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 40: [2022-11-25 21:29:18,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 40: [2022-11-25 21:29:18,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 40: [2022-11-25 21:29:18,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 29: [2022-11-25 21:29:18,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-25 21:29:18,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-25 21:29:18,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 42: [2022-11-25 21:29:18,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 59: [2022-11-25 21:29:18,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-25 21:29:18,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 42: [2022-11-25 21:29:18,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 59: [2022-11-25 21:29:18,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 42: [2022-11-25 21:29:18,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 1: [2022-11-25 21:29:18,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-25 21:29:18,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-25 21:29:18,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 28: [2022-11-25 21:29:18,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-25 21:29:18,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-25 21:29:18,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 28: [2022-11-25 21:29:18,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-25 21:29:18,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-25 21:29:18,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 49: [2022-11-25 21:29:18,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-25 21:29:18,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 49: [2022-11-25 21:29:18,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 58: [2022-11-25 21:29:18,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-25 21:29:18,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-25 21:29:18,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 25: [2022-11-25 21:29:18,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-25 21:29:18,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-25 21:29:18,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 36: [2022-11-25 21:29:18,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 34: [2022-11-25 21:29:18,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 34: [2022-11-25 21:29:18,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-25 21:29:18,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 36: [2022-11-25 21:29:18,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-25 21:29:18,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 43: [2022-11-25 21:29:18,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 43: [2022-11-25 21:29:18,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 50: [2022-11-25 21:29:18,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 43: [2022-11-25 21:29:18,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 50: [2022-11-25 21:29:18,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-25 21:29:18,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 15: [2022-11-25 21:29:18,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-25 21:29:18,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-25 21:29:18,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 47: [2022-11-25 21:29:18,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-25 21:29:18,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-25 21:29:18,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 57: [2022-11-25 21:29:18,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 57: [2022-11-25 21:29:18,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 57: [2022-11-25 21:29:18,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 16: [2022-11-25 21:29:18,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-25 21:29:18,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-25 21:29:18,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 23: [2022-11-25 21:29:18,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-25 21:29:18,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 23: [2022-11-25 21:29:18,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 2: [2022-11-25 21:29:18,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-25 21:29:18,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-25 21:29:18,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 19: [2022-11-25 21:29:18,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 19: [2022-11-25 21:29:18,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-25 21:29:18,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 52: [2022-11-25 21:29:18,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-25 21:29:18,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-25 21:29:18,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 54: [2022-11-25 21:29:18,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 54: [2022-11-25 21:29:18,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-25 21:29:18,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 56: [2022-11-25 21:29:18,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-25 21:29:18,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 21: [2022-11-25 21:29:18,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-25 21:29:18,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 56: [2022-11-25 21:29:18,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 21: [2022-11-25 21:29:18,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 51: [2022-11-25 21:29:18,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 51: [2022-11-25 21:29:18,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-25 21:29:18,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 35: [2022-11-25 21:29:18,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-25 21:29:18,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-25 21:29:18,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 44: [2022-11-25 21:29:18,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-25 21:29:18,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-25 21:29:18,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 62: [2022-11-25 21:29:18,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-25 21:29:18,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-25 21:29:18,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 9: [2022-11-25 21:29:18,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-25 21:29:18,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-25 21:29:18,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 20: [2022-11-25 21:29:18,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-25 21:29:18,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-25 21:29:18,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 31: [2022-11-25 21:29:18,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-25 21:29:18,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-25 21:29:18,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 10: [2022-11-25 21:29:18,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-25 21:29:18,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-25 21:29:18,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 40: [2022-11-25 21:29:18,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-25 21:29:18,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-25 21:29:18,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 55: [2022-11-25 21:29:18,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 55: [2022-11-25 21:29:18,514] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-25 21:29:18,514] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 39: [2022-11-25 21:29:18,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 39: [2022-11-25 21:29:18,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-25 21:29:18,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 60: [2022-11-25 21:29:18,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-25 21:29:18,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-25 21:29:18,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 12: [2022-11-25 21:29:18,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-25 21:29:18,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-25 21:29:18,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 53: [2022-11-25 21:29:18,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-25 21:29:18,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-25 21:29:18,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 51: [2022-11-25 21:29:18,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-25 21:29:18,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-25 21:29:18,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 41: [2022-11-25 21:29:18,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 41: [2022-11-25 21:29:18,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 41: [2022-11-25 21:29:18,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 32: [2022-11-25 21:29:18,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-25 21:29:18,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-25 21:29:18,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 26: [2022-11-25 21:29:18,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-25 21:29:18,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-25 21:29:18,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 46: [2022-11-25 21:29:18,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-25 21:29:18,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-25 21:29:18,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 48: [2022-11-25 21:29:18,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-25 21:29:18,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 22: [2022-11-25 21:29:18,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 48: [2022-11-25 21:29:18,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 22: [2022-11-25 21:29:18,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-25 21:29:18,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 17: [2022-11-25 21:29:18,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 17: [2022-11-25 21:29:18,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-25 21:29:18,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 33: [2022-11-25 21:29:18,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 33: [2022-11-25 21:29:18,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 33: [2022-11-25 21:29:18,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 38: [2022-11-25 21:29:18,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 38: [2022-11-25 21:29:18,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-25 21:29:18,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 34: [2022-11-25 21:29:18,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 34: [2022-11-25 21:29:18,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-25 21:29:18,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 30: [2022-11-25 21:29:18,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-25 21:29:18,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-25 21:29:18,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 24: [2022-11-25 21:29:18,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-25 21:29:18,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-25 21:29:18,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 63: [2022-11-25 21:29:18,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-25 21:29:18,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-25 21:29:18,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 7: [2022-11-25 21:29:18,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-25 21:29:18,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-25 21:29:18,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 14: [2022-11-25 21:29:18,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 14: [2022-11-25 21:29:18,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-25 21:29:18,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 59: [2022-11-25 21:29:18,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-25 21:29:18,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-25 21:29:18,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 6: [2022-11-25 21:29:18,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-25 21:29:18,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-25 21:29:18,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 11: [2022-11-25 21:29:18,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-25 21:29:18,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-25 21:29:18,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 18: [2022-11-25 21:29:18,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-25 21:29:18,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-25 21:29:18,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 8: [2022-11-25 21:29:18,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 8: [2022-11-25 21:29:18,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-25 21:29:18,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 28: [2022-11-25 21:29:18,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 28: [2022-11-25 21:29:18,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 31: [2022-11-25 21:29:18,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 28: [2022-11-25 21:29:18,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 31: [2022-11-25 21:29:18,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-25 21:29:18,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 40: [2022-11-25 21:29:18,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-25 21:29:18,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-25 21:29:18,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 27: [2022-11-25 21:29:18,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 27: [2022-11-25 21:29:18,548] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-25 21:29:18,548] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 45: [2022-11-25 21:29:18,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-25 21:29:18,548] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-25 21:29:18,549] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 0: [2022-11-25 21:29:18,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-25 21:29:18,549] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-25 21:29:18,549] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 8: [2022-11-25 21:29:18,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-25 21:29:18,549] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-25 21:29:18,549] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 5: [2022-11-25 21:29:18,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-25 21:29:18,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-25 21:29:18,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 33: [2022-11-25 21:29:18,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-25 21:29:18,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-25 21:29:18,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 22: [2022-11-25 21:29:18,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-25 21:29:18,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-25 21:29:18,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 4: [2022-11-25 21:29:18,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-25 21:29:18,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-25 21:29:18,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 37: [2022-11-25 21:29:18,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-25 21:29:18,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-25 21:29:18,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 26: [2022-11-25 21:29:18,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 26: [2022-11-25 21:29:18,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-25 21:29:18,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 2: [2022-11-25 21:29:18,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-25 21:29:18,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-25 21:29:18,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 36: [2022-11-25 21:29:18,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 36: [2022-11-25 21:29:18,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-25 21:29:18,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 13: [2022-11-25 21:29:18,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-25 21:29:18,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-25 21:29:18,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 31: [2022-11-25 21:29:18,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 31: [2022-11-25 21:29:18,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-25 21:29:18,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 3: [2022-11-25 21:29:18,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-25 21:29:18,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-25 21:29:18,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 41: [2022-11-25 21:29:18,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-25 21:29:18,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-25 21:29:18,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 58: [2022-11-25 21:29:18,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 1: [2022-11-25 21:29:18,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-25 21:29:18,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 58: [2022-11-25 21:29:18,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 1: [2022-11-25 21:29:18,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 58: [2022-11-25 21:29:18,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 50: [2022-11-25 21:29:18,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-25 21:29:18,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-25 21:29:18,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 9: [2022-11-25 21:29:18,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 9: [2022-11-25 21:29:18,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-25 21:29:18,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 23: [2022-11-25 21:29:18,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 23: [2022-11-25 21:29:18,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-25 21:29:18,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 56: [2022-11-25 21:29:18,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 56: [2022-11-25 21:29:18,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-25 21:29:18,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 39: [2022-11-25 21:29:18,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-25 21:29:18,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-25 21:29:18,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 29: [2022-11-25 21:29:18,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-25 21:29:18,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 42: [2022-11-25 21:29:18,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 42: [2022-11-25 21:29:18,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 29: [2022-11-25 21:29:18,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 42: [2022-11-25 21:29:18,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 40: [2022-11-25 21:29:18,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-25 21:29:18,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-25 21:29:18,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 31: [2022-11-25 21:29:18,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-25 21:29:18,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-25 21:29:18,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 43: [2022-11-25 21:29:18,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-25 21:29:18,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-25 21:29:18,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 16: [2022-11-25 21:29:18,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-25 21:29:18,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 16: [2022-11-25 21:29:18,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 49: [2022-11-25 21:29:18,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-25 21:29:18,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-25 21:29:18,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 57: [2022-11-25 21:29:18,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 57: [2022-11-25 21:29:18,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-25 21:29:18,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 47: [2022-11-25 21:29:18,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-25 21:29:18,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-25 21:29:18,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 52: [2022-11-25 21:29:18,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-25 21:29:18,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-25 21:29:18,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 25: [2022-11-25 21:29:18,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 25: [2022-11-25 21:29:18,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-25 21:29:18,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 15: [2022-11-25 21:29:18,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 15: [2022-11-25 21:29:18,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-25 21:29:18,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 61: [2022-11-25 21:29:18,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-25 21:29:18,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-25 21:29:18,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 44: [2022-11-25 21:29:18,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-25 21:29:18,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-25 21:29:18,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 54: [2022-11-25 21:29:18,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 54: [2022-11-25 21:29:18,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-25 21:29:18,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 19: [2022-11-25 21:29:18,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-25 21:29:18,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-25 21:29:18,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 25: [2022-11-25 21:29:18,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-25 21:29:18,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-25 21:29:18,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 55: [2022-11-25 21:29:18,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-25 21:29:18,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-25 21:29:18,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 60: [2022-11-25 21:29:18,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-25 21:29:18,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-25 21:29:18,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 12: [2022-11-25 21:29:18,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-25 21:29:18,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-25 21:29:18,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 17: [2022-11-25 21:29:18,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-25 21:29:18,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-25 21:29:18,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 10: [2022-11-25 21:29:18,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-25 21:29:18,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-25 21:29:18,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 53: [2022-11-25 21:29:18,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-25 21:29:18,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-25 21:29:18,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 24: [2022-11-25 21:29:18,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-25 21:29:18,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-25 21:29:18,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 51: [2022-11-25 21:29:18,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-25 21:29:18,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-25 21:29:18,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 34: [2022-11-25 21:29:18,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 35: [2022-11-25 21:29:18,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 34: [2022-11-25 21:29:18,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-25 21:29:18,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 35: [2022-11-25 21:29:18,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-25 21:29:18,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 59: [2022-11-25 21:29:18,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-25 21:29:18,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-25 21:29:18,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 63: [2022-11-25 21:29:18,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 63: [2022-11-25 21:29:18,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-25 21:29:18,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 7: [2022-11-25 21:29:18,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-25 21:29:18,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 7: [2022-11-25 21:29:18,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 38: [2022-11-25 21:29:18,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-25 21:29:18,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-25 21:29:18,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 20: [2022-11-25 21:29:18,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-25 21:29:18,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-25 21:29:18,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 48: [2022-11-25 21:29:18,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 8: [2022-11-25 21:29:18,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 48: [2022-11-25 21:29:18,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 8: [2022-11-25 21:29:18,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 48: [2022-11-25 21:29:18,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 8: [2022-11-25 21:29:18,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 32: [2022-11-25 21:29:18,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 30: [2022-11-25 21:29:18,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 32: [2022-11-25 21:29:18,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-25 21:29:18,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 30: [2022-11-25 21:29:18,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 30: [2022-11-25 21:29:18,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 6: [2022-11-25 21:29:18,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-25 21:29:18,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-25 21:29:18,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 14: [2022-11-25 21:29:18,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 28: [2022-11-25 21:29:18,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 14: [2022-11-25 21:29:18,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 28: [2022-11-25 21:29:18,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 14: [2022-11-25 21:29:18,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 62: [2022-11-25 21:29:18,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 28: [2022-11-25 21:29:18,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 62: [2022-11-25 21:29:18,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-25 21:29:18,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 46: [2022-11-25 21:29:18,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-25 21:29:18,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-25 21:29:18,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 33: [2022-11-25 21:29:18,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 33: [2022-11-25 21:29:18,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-25 21:29:18,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 11: [2022-11-25 21:29:18,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 11: [2022-11-25 21:29:18,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-25 21:29:18,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 18: [2022-11-25 21:29:18,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 18: [2022-11-25 21:29:18,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-25 21:29:18,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 0: [2022-11-25 21:29:18,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 0: [2022-11-25 21:29:18,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-25 21:29:18,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 45: [2022-11-25 21:29:18,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-25 21:29:18,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-25 21:29:18,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 5: [2022-11-25 21:29:18,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 5: [2022-11-25 21:29:18,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-25 21:29:18,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 26: [2022-11-25 21:29:18,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-25 21:29:18,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-25 21:29:18,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 36: [2022-11-25 21:29:18,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-25 21:29:18,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-25 21:29:18,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 4: [2022-11-25 21:29:18,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 4: [2022-11-25 21:29:18,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-25 21:29:18,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 37: [2022-11-25 21:29:18,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 37: [2022-11-25 21:29:18,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-25 21:29:18,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 13: [2022-11-25 21:29:18,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 13: [2022-11-25 21:29:18,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-25 21:29:18,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 9: [2022-11-25 21:29:18,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 9: [2022-11-25 21:29:18,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-25 21:29:18,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 22: [2022-11-25 21:29:18,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-25 21:29:18,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-25 21:29:18,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 27: [2022-11-25 21:29:18,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 3: [2022-11-25 21:29:18,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 27: [2022-11-25 21:29:18,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 3: [2022-11-25 21:29:18,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 27: [2022-11-25 21:29:18,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 3: [2022-11-25 21:29:18,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 41: [2022-11-25 21:29:18,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-25 21:29:18,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-25 21:29:18,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 40: [2022-11-25 21:29:18,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-25 21:29:18,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-25 21:29:18,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 49: [2022-11-25 21:29:18,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-25 21:29:18,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-25 21:29:18,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 50: [2022-11-25 21:29:18,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-25 21:29:18,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 31: [2022-11-25 21:29:18,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 50: [2022-11-25 21:29:18,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 23: [2022-11-25 21:29:18,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-25 21:29:18,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-25 21:29:18,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 31: [2022-11-25 21:29:18,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-25 21:29:18,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 58: [2022-11-25 21:29:18,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 58: [2022-11-25 21:29:18,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 1: [2022-11-25 21:29:18,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 58: [2022-11-25 21:29:18,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 1: [2022-11-25 21:29:18,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-25 21:29:18,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 29: [2022-11-25 21:29:18,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-25 21:29:18,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-25 21:29:18,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 2: [2022-11-25 21:29:18,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-25 21:29:18,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-25 21:29:18,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 39: [2022-11-25 21:29:18,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-25 21:29:18,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-25 21:29:18,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 54: [2022-11-25 21:29:18,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 54: [2022-11-25 21:29:18,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-25 21:29:18,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 43: [2022-11-25 21:29:18,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-25 21:29:18,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-25 21:29:18,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 25: [2022-11-25 21:29:18,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-25 21:29:18,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-25 21:29:18,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 57: [2022-11-25 21:29:18,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-25 21:29:18,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-25 21:29:18,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 56: [2022-11-25 21:29:18,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 21: [2022-11-25 21:29:18,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-25 21:29:18,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 56: [2022-11-25 21:29:18,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 21: [2022-11-25 21:29:18,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 56: [2022-11-25 21:29:18,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 47: [2022-11-25 21:29:18,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-25 21:29:18,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-25 21:29:18,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 19: [2022-11-25 21:29:18,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-25 21:29:18,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-25 21:29:18,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 10: [2022-11-25 21:29:18,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-25 21:29:18,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-25 21:29:18,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 52: [2022-11-25 21:29:18,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-25 21:29:18,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-25 21:29:18,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 33: [2022-11-25 21:29:18,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-25 21:29:18,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-25 21:29:18,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 62: [2022-11-25 21:29:18,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-25 21:29:18,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 62: [2022-11-25 21:29:18,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 24: [2022-11-25 21:29:18,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 24: [2022-11-25 21:29:18,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-25 21:29:18,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 44: [2022-11-25 21:29:18,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-25 21:29:18,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-25 21:29:18,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 46: [2022-11-25 21:29:18,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-25 21:29:18,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-25 21:29:18,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 17: [2022-11-25 21:29:18,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-25 21:29:18,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-25 21:29:18,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 45: [2022-11-25 21:29:18,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 45: [2022-11-25 21:29:18,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-25 21:29:18,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 14: [2022-11-25 21:29:18,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-25 21:29:18,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-25 21:29:18,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 59: [2022-11-25 21:29:18,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-25 21:29:18,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-25 21:29:18,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 51: [2022-11-25 21:29:18,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-25 21:29:18,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-25 21:29:18,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 60: [2022-11-25 21:29:18,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-25 21:29:18,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-25 21:29:18,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 19: [2022-11-25 21:29:18,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 39: [2022-11-25 21:29:18,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-25 21:29:18,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 19: [2022-11-25 21:29:18,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 39: [2022-11-25 21:29:18,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 19: [2022-11-25 21:29:18,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 53: [2022-11-25 21:29:18,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-25 21:29:18,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-25 21:29:18,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 7: [2022-11-25 21:29:18,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-25 21:29:18,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-25 21:29:18,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 12: [2022-11-25 21:29:18,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-25 21:29:18,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-25 21:29:18,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 44: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 15: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 44: [2022-11-25 21:29:18,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 15: [2022-11-25 21:29:18,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 13: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 27: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 44: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 13: [2022-11-25 21:29:18,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 15: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 4: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 13: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 27: [2022-11-25 21:29:18,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 4: [2022-11-25 21:29:18,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 27: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 4: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 5: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 34: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 5: [2022-11-25 21:29:18,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 35: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 5: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 34: [2022-11-25 21:29:18,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 35: [2022-11-25 21:29:18,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 34: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 35: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 9: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 9: [2022-11-25 21:29:18,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-25 21:29:18,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 6: [2022-11-25 21:29:18,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 6: [2022-11-25 21:29:18,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-25 21:29:18,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 30: [2022-11-25 21:29:18,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-25 21:29:18,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-25 21:29:18,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 18: [2022-11-25 21:29:18,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-25 21:29:18,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-25 21:29:18,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 29: [2022-11-25 21:29:18,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-25 21:29:18,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-25 21:29:18,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 25: [2022-11-25 21:29:18,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 38: [2022-11-25 21:29:18,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 55: [2022-11-25 21:29:18,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 38: [2022-11-25 21:29:18,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 55: [2022-11-25 21:29:18,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 23: [2022-11-25 21:29:18,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 25: [2022-11-25 21:29:18,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 38: [2022-11-25 21:29:18,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 55: [2022-11-25 21:29:18,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 25: [2022-11-25 21:29:18,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 36: [2022-11-25 21:29:18,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 23: [2022-11-25 21:29:18,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 36: [2022-11-25 21:29:18,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 23: [2022-11-25 21:29:18,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 36: [2022-11-25 21:29:18,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 11: [2022-11-25 21:29:18,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 11: [2022-11-25 21:29:18,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 11: [2022-11-25 21:29:18,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 8: [2022-11-25 21:29:18,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-25 21:29:18,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-25 21:29:18,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 8: [2022-11-25 21:29:18,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 8: [2022-11-25 21:29:18,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-25 21:29:18,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 0: [2022-11-25 21:29:18,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 0: [2022-11-25 21:29:18,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 63: [2022-11-25 21:29:18,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 0: [2022-11-25 21:29:18,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 63: [2022-11-25 21:29:18,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-25 21:29:18,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 48: [2022-11-25 21:29:18,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-25 21:29:18,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-25 21:29:18,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 37: [2022-11-25 21:29:18,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 37: [2022-11-25 21:29:18,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-25 21:29:18,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 47: [2022-11-25 21:29:18,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-25 21:29:18,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 31: [2022-11-25 21:29:18,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 47: [2022-11-25 21:29:18,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 31: [2022-11-25 21:29:18,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-25 21:29:18,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 32: [2022-11-25 21:29:18,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-25 21:29:18,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-25 21:29:18,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 32: [2022-11-25 21:29:18,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 32: [2022-11-25 21:29:18,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-25 21:29:18,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 2: [2022-11-25 21:29:18,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 2: [2022-11-25 21:29:18,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-25 21:29:18,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 3: [2022-11-25 21:29:18,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 52: [2022-11-25 21:29:18,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-25 21:29:18,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-25 21:29:18,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 3: [2022-11-25 21:29:18,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 3: [2022-11-25 21:29:18,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 1: [2022-11-25 21:29:18,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 1: [2022-11-25 21:29:18,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 1: [2022-11-25 21:29:18,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 48: [2022-11-25 21:29:18,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-25 21:29:18,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-25 21:29:18,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 43: [2022-11-25 21:29:18,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-25 21:29:18,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 43: [2022-11-25 21:29:18,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 20: [2022-11-25 21:29:18,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-25 21:29:18,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 20: [2022-11-25 21:29:18,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 50: [2022-11-25 21:29:18,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 50: [2022-11-25 21:29:18,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 38: [2022-11-25 21:29:18,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 50: [2022-11-25 21:29:18,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 38: [2022-11-25 21:29:18,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-25 21:29:18,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 41: [2022-11-25 21:29:18,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 41: [2022-11-25 21:29:18,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-25 21:29:18,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 58: [2022-11-25 21:29:18,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 58: [2022-11-25 21:29:18,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-25 21:29:18,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 62: [2022-11-25 21:29:18,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 22: [2022-11-25 21:29:18,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 62: [2022-11-25 21:29:18,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 22: [2022-11-25 21:29:18,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 62: [2022-11-25 21:29:18,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 22: [2022-11-25 21:29:18,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 57: [2022-11-25 21:29:18,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 57: [2022-11-25 21:29:18,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-25 21:29:18,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 54: [2022-11-25 21:29:18,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 54: [2022-11-25 21:29:18,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-25 21:29:18,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 33: [2022-11-25 21:29:18,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-25 21:29:18,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-25 21:29:18,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 61: [2022-11-25 21:29:18,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 61: [2022-11-25 21:29:18,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 61: [2022-11-25 21:29:18,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 63: [2022-11-25 21:29:18,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-25 21:29:18,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-25 21:29:18,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-25 21:29:18,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-25 21:29:18,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 63: [2022-11-25 21:29:18,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 17: [2022-11-25 21:29:18,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-25 21:29:18,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-25 21:29:18,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 51: [2022-11-25 21:29:18,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-25 21:29:18,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-25 21:29:18,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 21: [2022-11-25 21:29:18,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 21: [2022-11-25 21:29:18,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-25 21:29:18,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 56: [2022-11-25 21:29:18,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 28: [2022-11-25 21:29:18,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 56: [2022-11-25 21:29:18,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 56: [2022-11-25 21:29:18,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 28: [2022-11-25 21:29:18,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-25 21:29:18,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 46: [2022-11-25 21:29:18,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-25 21:29:18,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-25 21:29:18,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 62: [2022-11-25 21:29:18,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-25 21:29:18,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-25 21:29:18,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 12: [2022-11-25 21:29:18,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-25 21:29:18,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-25 21:29:18,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 55: [2022-11-25 21:29:18,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-25 21:29:18,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-25 21:29:18,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 20: [2022-11-25 21:29:18,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-25 21:29:18,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-25 21:29:18,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 35: [2022-11-25 21:29:18,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-25 21:29:18,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-25 21:29:18,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 24: [2022-11-25 21:29:18,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-25 21:29:18,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-25 21:29:18,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 53: [2022-11-25 21:29:18,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-25 21:29:18,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-25 21:29:18,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 60: [2022-11-25 21:29:18,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 60: [2022-11-25 21:29:18,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-25 21:29:18,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 42: [2022-11-25 21:29:18,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 42: [2022-11-25 21:29:18,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-25 21:29:18,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 21: [2022-11-25 21:29:18,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-25 21:29:18,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-25 21:29:18,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 40: [2022-11-25 21:29:18,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-25 21:29:18,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-25 21:29:18,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 15: [2022-11-25 21:29:18,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-25 21:29:18,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-25 21:29:18,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 61: [2022-11-25 21:29:18,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-25 21:29:18,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-25 21:29:18,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 49: [2022-11-25 21:29:18,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-25 21:29:18,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-25 21:29:18,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 61: [2022-11-25 21:29:18,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-25 21:29:18,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-25 21:29:18,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 26: [2022-11-25 21:29:18,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 26: [2022-11-25 21:29:18,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 26: [2022-11-25 21:29:18,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-25 21:29:18,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 26: [2022-11-25 21:29:18,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 26: [2022-11-25 21:29:18,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 59: [2022-11-25 21:29:18,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-25 21:29:18,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-25 21:29:18,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 10: [2022-11-25 21:29:18,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-25 21:29:18,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 10: [2022-11-25 21:29:18,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 42: [2022-11-25 21:29:18,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 42: [2022-11-25 21:29:18,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-25 21:29:18,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 16: [2022-11-25 21:29:18,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-25 21:29:18,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-25 21:29:18,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 16: [2022-11-25 21:29:18,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-25 21:29:18,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step6000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-25 21:29:18,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! 0: successfully saved checkpoint at iteration 6000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5443.20 63: iteration 6010/ 24424 | consumed samples: 3077120 | consumed tokens: 6301941760 | elapsed time per iteration (s): 2.83 | learning rate: 1.759E-04 | global batch size: 512 | lm loss: 2.254332E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.792 | TFLOPs: 18.61 | 63: iteration 6020/ 24424 | consumed samples: 3082240 | consumed tokens: 6312427520 | elapsed time per iteration (s): 2.26 | learning rate: 1.758E-04 | global batch size: 512 | lm loss: 2.256562E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.532 | TFLOPs: 23.32 | 63: iteration 6030/ 24424 | consumed samples: 3087360 | consumed tokens: 6322913280 | elapsed time per iteration (s): 2.27 | learning rate: 1.757E-04 | global batch size: 512 | lm loss: 2.249885E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.880 | TFLOPs: 23.25 | 63: iteration 6040/ 24424 | consumed samples: 3092480 | consumed tokens: 6333399040 | elapsed time per iteration (s): 2.24 | learning rate: 1.757E-04 | global batch size: 512 | lm loss: 2.269367E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.259 | TFLOPs: 23.50 | 63: iteration 6050/ 24424 | consumed samples: 3097600 | consumed tokens: 6343884800 | elapsed time per iteration (s): 2.25 | learning rate: 1.756E-04 | global batch size: 512 | lm loss: 2.253257E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.166 | TFLOPs: 23.39 | 63: iteration 6060/ 24424 | consumed samples: 3102720 | consumed tokens: 6354370560 | elapsed time per iteration (s): 2.26 | learning rate: 1.755E-04 | global batch size: 512 | lm loss: 2.253158E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.886 | TFLOPs: 23.36 | 63: iteration 6070/ 24424 | consumed samples: 3107840 | consumed tokens: 6364856320 | elapsed time per iteration (s): 3.03 | learning rate: 1.754E-04 | global batch size: 512 | lm loss: 2.281628E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 169.196 | TFLOPs: 17.42 | 63: iteration 6080/ 24424 | consumed samples: 3112960 | consumed tokens: 6375342080 | elapsed time per iteration (s): 2.23 | learning rate: 1.753E-04 | global batch size: 512 | lm loss: 2.269288E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.921 | TFLOPs: 23.67 | 63: iteration 6090/ 24424 | consumed samples: 3118080 | consumed tokens: 6385827840 | elapsed time per iteration (s): 2.27 | learning rate: 1.753E-04 | global batch size: 512 | lm loss: 2.231600E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.621 | TFLOPs: 23.23 | 63: iteration 6100/ 24424 | consumed samples: 3123200 | consumed tokens: 6396313600 | elapsed time per iteration (s): 2.26 | learning rate: 1.752E-04 | global batch size: 512 | lm loss: 2.264396E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.751 | TFLOPs: 23.34 | 63: iteration 6110/ 24424 | consumed samples: 3128320 | consumed tokens: 6406799360 | elapsed time per iteration (s): 2.23 | learning rate: 1.751E-04 | global batch size: 512 | lm loss: 2.271792E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.117 | TFLOPs: 23.59 | 63: iteration 6120/ 24424 | consumed samples: 3133440 | consumed tokens: 6417285120 | elapsed time per iteration (s): 2.23 | learning rate: 1.750E-04 | global batch size: 512 | lm loss: 2.251569E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.137 | TFLOPs: 23.59 | 63: iteration 6130/ 24424 | consumed samples: 3138560 | consumed tokens: 6427770880 | elapsed time per iteration (s): 2.27 | learning rate: 1.749E-04 | global batch size: 512 | lm loss: 2.265433E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.724 | TFLOPs: 23.24 | 63: iteration 6140/ 24424 | consumed samples: 3143680 | consumed tokens: 6438256640 | elapsed time per iteration (s): 2.24 | learning rate: 1.749E-04 | global batch size: 512 | lm loss: 2.265886E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.385 | TFLOPs: 23.51 | 63: iteration 6150/ 24424 | consumed samples: 3148800 | consumed tokens: 6448742400 | elapsed time per iteration (s): 2.25 | learning rate: 1.748E-04 | global batch size: 512 | lm loss: 2.265625E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.819 | TFLOPs: 23.45 | 63: iteration 6160/ 24424 | consumed samples: 3153920 | consumed tokens: 6459228160 | elapsed time per iteration (s): 2.28 | learning rate: 1.747E-04 | global batch size: 512 | lm loss: 2.263001E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.695 | TFLOPs: 23.13 | 63: iteration 6170/ 24424 | consumed samples: 3159040 | consumed tokens: 6469713920 | elapsed time per iteration (s): 2.26 | learning rate: 1.746E-04 | global batch size: 512 | lm loss: 2.276907E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.774 | TFLOPs: 23.35 | 63: iteration 6180/ 24424 | consumed samples: 3164160 | consumed tokens: 6480199680 | elapsed time per iteration (s): 2.24 | learning rate: 1.745E-04 | global batch size: 512 | lm loss: 2.264170E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.167 | TFLOPs: 23.49 | 63: iteration 6190/ 24424 | consumed samples: 3169280 | consumed tokens: 6490685440 | elapsed time per iteration (s): 2.23 | learning rate: 1.745E-04 | global batch size: 512 | lm loss: 2.258610E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.527 | TFLOPs: 23.63 | 63: iteration 6200/ 24424 | consumed samples: 3174400 | consumed tokens: 6501171200 | elapsed time per iteration (s): 2.23 | learning rate: 1.744E-04 | global batch size: 512 | lm loss: 2.257359E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.492 | TFLOPs: 23.63 | 63: iteration 6210/ 24424 | consumed samples: 3179520 | consumed tokens: 6511656960 | elapsed time per iteration (s): 2.23 | learning rate: 1.743E-04 | global batch size: 512 | lm loss: 2.271551E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.631 | TFLOPs: 23.64 | 63: iteration 6220/ 24424 | consumed samples: 3184640 | consumed tokens: 6522142720 | elapsed time per iteration (s): 2.24 | learning rate: 1.742E-04 | global batch size: 512 | lm loss: 2.251992E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.275 | TFLOPs: 23.50 | 63: iteration 6230/ 24424 | consumed samples: 3189760 | consumed tokens: 6532628480 | elapsed time per iteration (s): 2.25 | learning rate: 1.741E-04 | global batch size: 512 | lm loss: 2.271926E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.493 | TFLOPs: 23.42 | 63: iteration 6240/ 24424 | consumed samples: 3194880 | consumed tokens: 6543114240 | elapsed time per iteration (s): 2.24 | learning rate: 1.740E-04 | global batch size: 512 | lm loss: 2.245878E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.754 | TFLOPs: 23.55 | 63: iteration 6250/ 24424 | consumed samples: 3200000 | consumed tokens: 6553600000 | elapsed time per iteration (s): 2.28 | learning rate: 1.740E-04 | global batch size: 512 | lm loss: 2.233242E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.932 | TFLOPs: 23.16 | 63: iteration 6260/ 24424 | consumed samples: 3205120 | consumed tokens: 6564085760 | elapsed time per iteration (s): 2.27 | learning rate: 1.739E-04 | global batch size: 512 | lm loss: 2.251236E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.938 | TFLOPs: 23.26 | 63: iteration 6270/ 24424 | consumed samples: 3210240 | consumed tokens: 6574571520 | elapsed time per iteration (s): 2.25 | learning rate: 1.738E-04 | global batch size: 512 | lm loss: 2.233199E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.052 | TFLOPs: 23.48 | 63: iteration 6280/ 24424 | consumed samples: 3215360 | consumed tokens: 6585057280 | elapsed time per iteration (s): 2.23 | learning rate: 1.737E-04 | global batch size: 512 | lm loss: 2.273436E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.658 | TFLOPs: 23.64 | 63: iteration 6290/ 24424 | consumed samples: 3220480 | consumed tokens: 6595543040 | elapsed time per iteration (s): 2.24 | learning rate: 1.736E-04 | global batch size: 512 | lm loss: 2.254914E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.332 | TFLOPs: 23.51 | 63: iteration 6300/ 24424 | consumed samples: 3225600 | consumed tokens: 6606028800 | elapsed time per iteration (s): 2.23 | learning rate: 1.736E-04 | global batch size: 512 | lm loss: 2.262214E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.785 | TFLOPs: 23.66 | 63: iteration 6310/ 24424 | consumed samples: 3230720 | consumed tokens: 6616514560 | elapsed time per iteration (s): 2.24 | learning rate: 1.735E-04 | global batch size: 512 | lm loss: 2.233807E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.932 | TFLOPs: 23.57 | 63: iteration 6320/ 24424 | consumed samples: 3235840 | consumed tokens: 6627000320 | elapsed time per iteration (s): 2.23 | learning rate: 1.734E-04 | global batch size: 512 | lm loss: 2.266975E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.713 | TFLOPs: 23.65 | 63: iteration 6330/ 24424 | consumed samples: 3240960 | consumed tokens: 6637486080 | elapsed time per iteration (s): 2.28 | learning rate: 1.733E-04 | global batch size: 512 | lm loss: 2.239842E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.201 | TFLOPs: 23.08 | 63: iteration 6340/ 24424 | consumed samples: 3246080 | consumed tokens: 6647971840 | elapsed time per iteration (s): 2.28 | learning rate: 1.732E-04 | global batch size: 512 | lm loss: 2.259297E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.006 | TFLOPs: 23.16 | 63: iteration 6350/ 24424 | consumed samples: 3251200 | consumed tokens: 6658457600 | elapsed time per iteration (s): 2.23 | learning rate: 1.731E-04 | global batch size: 512 | lm loss: 2.243502E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.578 | TFLOPs: 23.63 | 63: iteration 6360/ 24424 | consumed samples: 3256320 | consumed tokens: 6668943360 | elapsed time per iteration (s): 2.24 | learning rate: 1.731E-04 | global batch size: 512 | lm loss: 2.235172E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.498 | TFLOPs: 23.52 | 63: iteration 6370/ 24424 | consumed samples: 3261440 | consumed tokens: 6679429120 | elapsed time per iteration (s): 2.31 | learning rate: 1.730E-04 | global batch size: 512 | lm loss: 2.231017E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.802 | TFLOPs: 22.83 | 63: iteration 6380/ 24424 | consumed samples: 3266560 | consumed tokens: 6689914880 | elapsed time per iteration (s): 2.25 | learning rate: 1.729E-04 | global batch size: 512 | lm loss: 2.260420E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.754 | TFLOPs: 23.45 | 63: iteration 6390/ 24424 | consumed samples: 3271680 | consumed tokens: 6700400640 | elapsed time per iteration (s): 2.25 | learning rate: 1.728E-04 | global batch size: 512 | lm loss: 2.241907E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.708 | TFLOPs: 23.44 | 63: iteration 6400/ 24424 | consumed samples: 3276800 | consumed tokens: 6710886400 | elapsed time per iteration (s): 2.23 | learning rate: 1.727E-04 | global batch size: 512 | lm loss: 2.245376E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.450 | TFLOPs: 23.62 | 63: iteration 6410/ 24424 | consumed samples: 3281920 | consumed tokens: 6721372160 | elapsed time per iteration (s): 2.25 | learning rate: 1.726E-04 | global batch size: 512 | lm loss: 2.244008E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.924 | TFLOPs: 23.46 | 63: iteration 6420/ 24424 | consumed samples: 3287040 | consumed tokens: 6731857920 | elapsed time per iteration (s): 2.25 | learning rate: 1.726E-04 | global batch size: 512 | lm loss: 2.242644E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.607 | TFLOPs: 23.43 | 63: iteration 6430/ 24424 | consumed samples: 3292160 | consumed tokens: 6742343680 | elapsed time per iteration (s): 2.24 | learning rate: 1.725E-04 | global batch size: 512 | lm loss: 2.219641E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.202 | TFLOPs: 23.49 | 63: iteration 6440/ 24424 | consumed samples: 3297280 | consumed tokens: 6752829440 | elapsed time per iteration (s): 2.28 | learning rate: 1.724E-04 | global batch size: 512 | lm loss: 2.241658E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.323 | TFLOPs: 23.09 | 63: iteration 6450/ 24424 | consumed samples: 3302400 | consumed tokens: 6763315200 | elapsed time per iteration (s): 2.23 | learning rate: 1.723E-04 | global batch size: 512 | lm loss: 2.247367E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.815 | TFLOPs: 23.66 | 63: iteration 6460/ 24424 | consumed samples: 3307520 | consumed tokens: 6773800960 | elapsed time per iteration (s): 2.23 | learning rate: 1.722E-04 | global batch size: 512 | lm loss: 2.262820E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.022 | TFLOPs: 23.68 | 63: iteration 6470/ 24424 | consumed samples: 3312640 | consumed tokens: 6784286720 | elapsed time per iteration (s): 2.26 | learning rate: 1.721E-04 | global batch size: 512 | lm loss: 2.254323E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.980 | TFLOPs: 23.37 | 63: iteration 6480/ 24424 | consumed samples: 3317760 | consumed tokens: 6794772480 | elapsed time per iteration (s): 2.24 | learning rate: 1.720E-04 | global batch size: 512 | lm loss: 2.224771E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.328 | TFLOPs: 23.51 | 63: iteration 6490/ 24424 | consumed samples: 3322880 | consumed tokens: 6805258240 | elapsed time per iteration (s): 2.26 | learning rate: 1.720E-04 | global batch size: 512 | lm loss: 2.242144E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.350 | TFLOPs: 23.30 | 63: iteration 6500/ 24424 | consumed samples: 3328000 | consumed tokens: 6815744000 | elapsed time per iteration (s): 2.25 | learning rate: 1.719E-04 | global batch size: 512 | lm loss: 2.230399E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.912 | TFLOPs: 23.46 | 63: iteration 6510/ 24424 | consumed samples: 3333120 | consumed tokens: 6826229760 | elapsed time per iteration (s): 2.24 | learning rate: 1.718E-04 | global batch size: 512 | lm loss: 2.245505E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.818 | TFLOPs: 23.56 | 63: iteration 6520/ 24424 | consumed samples: 3338240 | consumed tokens: 6836715520 | elapsed time per iteration (s): 2.25 | learning rate: 1.717E-04 | global batch size: 512 | lm loss: 2.231743E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.006 | TFLOPs: 23.47 | 63: iteration 6530/ 24424 | consumed samples: 3343360 | consumed tokens: 6847201280 | elapsed time per iteration (s): 2.37 | learning rate: 1.716E-04 | global batch size: 512 | lm loss: 2.217550E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 216.013 | TFLOPs: 22.24 | 63: iteration 6540/ 24424 | consumed samples: 3348480 | consumed tokens: 6857687040 | elapsed time per iteration (s): 2.27 | learning rate: 1.715E-04 | global batch size: 512 | lm loss: 2.256227E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.439 | TFLOPs: 23.21 | 63: iteration 6550/ 24424 | consumed samples: 3353600 | consumed tokens: 6868172800 | elapsed time per iteration (s): 3.35 | learning rate: 1.714E-04 | global batch size: 512 | lm loss: 2.248542E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 152.661 | TFLOPs: 15.72 | 63: iteration 6560/ 24424 | consumed samples: 3358720 | consumed tokens: 6878658560 | elapsed time per iteration (s): 2.23 | learning rate: 1.714E-04 | global batch size: 512 | lm loss: 2.236525E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.616 | TFLOPs: 23.64 | 63: iteration 6570/ 24424 | consumed samples: 3363840 | consumed tokens: 6889144320 | elapsed time per iteration (s): 2.25 | learning rate: 1.713E-04 | global batch size: 512 | lm loss: 2.207000E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.780 | TFLOPs: 23.45 | 63: iteration 6580/ 24424 | consumed samples: 3368960 | consumed tokens: 6899630080 | elapsed time per iteration (s): 2.23 | learning rate: 1.712E-04 | global batch size: 512 | lm loss: 2.245292E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.807 | TFLOPs: 23.66 | 63: iteration 6590/ 24424 | consumed samples: 3374080 | consumed tokens: 6910115840 | elapsed time per iteration (s): 2.26 | learning rate: 1.711E-04 | global batch size: 512 | lm loss: 2.221865E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.299 | TFLOPs: 23.30 | 63: iteration 6600/ 24424 | consumed samples: 3379200 | consumed tokens: 6920601600 | elapsed time per iteration (s): 2.25 | learning rate: 1.710E-04 | global batch size: 512 | lm loss: 2.239532E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.456 | TFLOPs: 23.42 | 63: iteration 6610/ 24424 | consumed samples: 3384320 | consumed tokens: 6931087360 | elapsed time per iteration (s): 2.24 | learning rate: 1.709E-04 | global batch size: 512 | lm loss: 2.230744E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.893 | TFLOPs: 23.56 | 63: iteration 6620/ 24424 | consumed samples: 3389440 | consumed tokens: 6941573120 | elapsed time per iteration (s): 2.25 | learning rate: 1.708E-04 | global batch size: 512 | lm loss: 2.241034E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.384 | TFLOPs: 23.41 | 63: iteration 6630/ 24424 | consumed samples: 3394560 | consumed tokens: 6952058880 | elapsed time per iteration (s): 2.23 | learning rate: 1.708E-04 | global batch size: 512 | lm loss: 2.248434E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.839 | TFLOPs: 23.66 | 63: iteration 6640/ 24424 | consumed samples: 3399680 | consumed tokens: 6962544640 | elapsed time per iteration (s): 2.25 | learning rate: 1.707E-04 | global batch size: 512 | lm loss: 2.231226E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.208 | TFLOPs: 23.39 | 63: iteration 6650/ 24424 | consumed samples: 3404800 | consumed tokens: 6973030400 | elapsed time per iteration (s): 2.23 | learning rate: 1.706E-04 | global batch size: 512 | lm loss: 2.248961E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.534 | TFLOPs: 23.63 | 63: iteration 6660/ 24424 | consumed samples: 3409920 | consumed tokens: 6983516160 | elapsed time per iteration (s): 2.23 | learning rate: 1.705E-04 | global batch size: 512 | lm loss: 2.237458E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.520 | TFLOPs: 23.63 | 63: iteration 6670/ 24424 | consumed samples: 3415040 | consumed tokens: 6994001920 | elapsed time per iteration (s): 2.24 | learning rate: 1.704E-04 | global batch size: 512 | lm loss: 2.228644E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.137 | TFLOPs: 23.49 | 63: iteration 6680/ 24424 | consumed samples: 3420160 | consumed tokens: 7004487680 | elapsed time per iteration (s): 2.23 | learning rate: 1.703E-04 | global batch size: 512 | lm loss: 2.231936E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.088 | TFLOPs: 23.58 | 63: iteration 6690/ 24424 | consumed samples: 3425280 | consumed tokens: 7014973440 | elapsed time per iteration (s): 2.23 | learning rate: 1.702E-04 | global batch size: 512 | lm loss: 2.231577E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.432 | TFLOPs: 23.62 | 63: iteration 6700/ 24424 | consumed samples: 3430400 | consumed tokens: 7025459200 | elapsed time per iteration (s): 2.25 | learning rate: 1.702E-04 | global batch size: 512 | lm loss: 2.238652E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.610 | TFLOPs: 23.43 | 63: iteration 6710/ 24424 | consumed samples: 3435520 | consumed tokens: 7035944960 | elapsed time per iteration (s): 2.34 | learning rate: 1.701E-04 | global batch size: 512 | lm loss: 2.266676E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 219.202 | TFLOPs: 22.57 | 63: iteration 6720/ 24424 | consumed samples: 3440640 | consumed tokens: 7046430720 | elapsed time per iteration (s): 2.34 | learning rate: 1.700E-04 | global batch size: 512 | lm loss: 2.225223E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 218.652 | TFLOPs: 22.51 | 63: iteration 6730/ 24424 | consumed samples: 3445760 | consumed tokens: 7056916480 | elapsed time per iteration (s): 2.25 | learning rate: 1.699E-04 | global batch size: 512 | lm loss: 2.249381E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.232 | TFLOPs: 23.39 | 63: iteration 6740/ 24424 | consumed samples: 3450880 | consumed tokens: 7067402240 | elapsed time per iteration (s): 2.25 | learning rate: 1.698E-04 | global batch size: 512 | lm loss: 2.216304E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.431 | TFLOPs: 23.41 | 63: iteration 6750/ 24424 | consumed samples: 3456000 | consumed tokens: 7077888000 | elapsed time per iteration (s): 2.23 | learning rate: 1.697E-04 | global batch size: 512 | lm loss: 2.212755E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.009 | TFLOPs: 23.68 | 63: iteration 6760/ 24424 | consumed samples: 3461120 | consumed tokens: 7088373760 | elapsed time per iteration (s): 2.25 | learning rate: 1.696E-04 | global batch size: 512 | lm loss: 2.249350E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.151 | TFLOPs: 23.38 | 63: iteration 6770/ 24424 | consumed samples: 3466240 | consumed tokens: 7098859520 | elapsed time per iteration (s): 2.26 | learning rate: 1.695E-04 | global batch size: 512 | lm loss: 2.234540E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.619 | TFLOPs: 23.33 | 63: iteration 6780/ 24424 | consumed samples: 3471360 | consumed tokens: 7109345280 | elapsed time per iteration (s): 2.25 | learning rate: 1.695E-04 | global batch size: 512 | lm loss: 2.229228E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.763 | TFLOPs: 23.45 | 63: iteration 6790/ 24424 | consumed samples: 3476480 | consumed tokens: 7119831040 | elapsed time per iteration (s): 2.23 | learning rate: 1.694E-04 | global batch size: 512 | lm loss: 2.235145E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.940 | TFLOPs: 23.67 | 63: iteration 6800/ 24424 | consumed samples: 3481600 | consumed tokens: 7130316800 | elapsed time per iteration (s): 2.24 | learning rate: 1.693E-04 | global batch size: 512 | lm loss: 2.243936E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.266 | TFLOPs: 23.50 | 63: iteration 6810/ 24424 | consumed samples: 3486720 | consumed tokens: 7140802560 | elapsed time per iteration (s): 2.26 | learning rate: 1.692E-04 | global batch size: 512 | lm loss: 2.225191E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.807 | TFLOPs: 23.35 | 63: iteration 6820/ 24424 | consumed samples: 3491840 | consumed tokens: 7151288320 | elapsed time per iteration (s): 2.23 | learning rate: 1.691E-04 | global batch size: 512 | lm loss: 2.221887E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.840 | TFLOPs: 23.66 | 63: iteration 6830/ 24424 | consumed samples: 3496960 | consumed tokens: 7161774080 | elapsed time per iteration (s): 2.25 | learning rate: 1.690E-04 | global batch size: 512 | lm loss: 2.235420E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.481 | TFLOPs: 23.42 | 63: iteration 6840/ 24424 | consumed samples: 3502080 | consumed tokens: 7172259840 | elapsed time per iteration (s): 2.23 | learning rate: 1.689E-04 | global batch size: 512 | lm loss: 2.234340E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.639 | TFLOPs: 23.64 | 63: iteration 6850/ 24424 | consumed samples: 3507200 | consumed tokens: 7182745600 | elapsed time per iteration (s): 2.26 | learning rate: 1.688E-04 | global batch size: 512 | lm loss: 2.247924E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.302 | TFLOPs: 23.30 | 63: iteration 6860/ 24424 | consumed samples: 3512320 | consumed tokens: 7193231360 | elapsed time per iteration (s): 3.62 | learning rate: 1.688E-04 | global batch size: 512 | lm loss: 2.214444E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 141.572 | TFLOPs: 14.57 | 63: iteration 6870/ 24424 | consumed samples: 3517440 | consumed tokens: 7203717120 | elapsed time per iteration (s): 2.24 | learning rate: 1.687E-04 | global batch size: 512 | lm loss: 2.236964E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.403 | TFLOPs: 23.51 | 63: iteration 6880/ 24424 | consumed samples: 3522560 | consumed tokens: 7214202880 | elapsed time per iteration (s): 2.25 | learning rate: 1.686E-04 | global batch size: 512 | lm loss: 2.220096E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.119 | TFLOPs: 23.38 | 63: iteration 6890/ 24424 | consumed samples: 3527680 | consumed tokens: 7224688640 | elapsed time per iteration (s): 2.28 | learning rate: 1.685E-04 | global batch size: 512 | lm loss: 2.236169E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.788 | TFLOPs: 23.14 | 63: iteration 6900/ 24424 | consumed samples: 3532800 | consumed tokens: 7235174400 | elapsed time per iteration (s): 2.26 | learning rate: 1.684E-04 | global batch size: 512 | lm loss: 2.239055E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.073 | TFLOPs: 23.27 | 63: iteration 6910/ 24424 | consumed samples: 3537920 | consumed tokens: 7245660160 | elapsed time per iteration (s): 2.28 | learning rate: 1.683E-04 | global batch size: 512 | lm loss: 2.214980E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.944 | TFLOPs: 23.16 | 63: iteration 6920/ 24424 | consumed samples: 3543040 | consumed tokens: 7256145920 | elapsed time per iteration (s): 2.23 | learning rate: 1.682E-04 | global batch size: 512 | lm loss: 2.228370E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.349 | TFLOPs: 23.61 | 63: iteration 6930/ 24424 | consumed samples: 3548160 | consumed tokens: 7266631680 | elapsed time per iteration (s): 2.26 | learning rate: 1.681E-04 | global batch size: 512 | lm loss: 2.252635E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.993 | TFLOPs: 23.37 | 63: iteration 6940/ 24424 | consumed samples: 3553280 | consumed tokens: 7277117440 | elapsed time per iteration (s): 2.24 | learning rate: 1.680E-04 | global batch size: 512 | lm loss: 2.214255E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.231 | TFLOPs: 23.50 | 63: iteration 6950/ 24424 | consumed samples: 3558400 | consumed tokens: 7287603200 | elapsed time per iteration (s): 2.26 | learning rate: 1.679E-04 | global batch size: 512 | lm loss: 2.212604E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.742 | TFLOPs: 23.34 | 63: iteration 6960/ 24424 | consumed samples: 3563520 | consumed tokens: 7298088960 | elapsed time per iteration (s): 2.24 | learning rate: 1.679E-04 | global batch size: 512 | lm loss: 2.226325E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.039 | TFLOPs: 23.58 | 63: iteration 6970/ 24424 | consumed samples: 3568640 | consumed tokens: 7308574720 | elapsed time per iteration (s): 2.24 | learning rate: 1.678E-04 | global batch size: 512 | lm loss: 2.234931E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.506 | TFLOPs: 23.52 | 63: iteration 6980/ 24424 | consumed samples: 3573760 | consumed tokens: 7319060480 | elapsed time per iteration (s): 2.25 | learning rate: 1.677E-04 | global batch size: 512 | lm loss: 2.232955E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.022 | TFLOPs: 23.47 | 63: iteration 6990/ 24424 | consumed samples: 3578880 | consumed tokens: 7329546240 | elapsed time per iteration (s): 2.24 | learning rate: 1.676E-04 | global batch size: 512 | lm loss: 2.219583E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.232 | TFLOPs: 23.50 | 63: iteration 7000/ 24424 | consumed samples: 3584000 | consumed tokens: 7340032000 | elapsed time per iteration (s): 2.26 | learning rate: 1.675E-04 | global batch size: 512 | lm loss: 2.199036E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.237 | TFLOPs: 23.29 | 63: ------------------------------------------------------------------------------------------ 63: valid loss at iteration 7000 | lm loss value: 2.179373E+00 | lm loss PPL: 8.840759E+00 | 63: ------------------------------------------------------------------------------------------ 0: saving checkpoint at iteration 7000 to checkpoints_3b9 0: [2022-11-25 22:07:22,566] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step7000 is begin to save! 0: [2022-11-25 22:07:22,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_01-model_00-model_states.pt... 32: [2022-11-25 22:07:22,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_21-model_00-model_states.pt... 32: [2022-11-25 22:07:22,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_21-model_00-model_states.pt. 32: [2022-11-25 22:07:22,872] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_22-model_00-model_states.pt... 0: [2022-11-25 22:07:22,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_01-model_00-model_states.pt. 0: [2022-11-25 22:07:22,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_03-model_00-model_states.pt... 32: [2022-11-25 22:07:23,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_22-model_00-model_states.pt. 32: [2022-11-25 22:07:23,113] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_23-model_00-model_states.pt... 0: [2022-11-25 22:07:23,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_03-model_00-model_states.pt. 0: [2022-11-25 22:07:23,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_04-model_00-model_states.pt... 32: [2022-11-25 22:07:23,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_23-model_00-model_states.pt. 32: [2022-11-25 22:07:23,347] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_24-model_00-model_states.pt... 0: [2022-11-25 22:07:23,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_04-model_00-model_states.pt. 0: [2022-11-25 22:07:23,437] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_05-model_00-model_states.pt... 32: [2022-11-25 22:07:23,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_24-model_00-model_states.pt. 32: [2022-11-25 22:07:23,575] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_25-model_00-model_states.pt... 0: [2022-11-25 22:07:23,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_05-model_00-model_states.pt. 0: [2022-11-25 22:07:23,672] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_06-model_00-model_states.pt... 32: [2022-11-25 22:07:23,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_25-model_00-model_states.pt. 32: [2022-11-25 22:07:23,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_26-model_00-model_states.pt... 0: [2022-11-25 22:07:23,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_06-model_00-model_states.pt. 0: [2022-11-25 22:07:23,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_07-model_00-model_states.pt... 32: [2022-11-25 22:07:24,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_26-model_00-model_states.pt. 32: [2022-11-25 22:07:24,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_27-model_00-model_states.pt... 0: [2022-11-25 22:07:24,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_07-model_00-model_states.pt. 0: [2022-11-25 22:07:24,131] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_08-model_00-model_states.pt... 32: [2022-11-25 22:07:24,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_27-model_00-model_states.pt. 32: [2022-11-25 22:07:24,277] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_28-model_00-model_states.pt... 0: [2022-11-25 22:07:24,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_08-model_00-model_states.pt. 0: [2022-11-25 22:07:24,357] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_09-model_00-model_states.pt... 32: [2022-11-25 22:07:24,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_28-model_00-model_states.pt. 32: [2022-11-25 22:07:24,512] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_29-model_00-model_states.pt... 0: [2022-11-25 22:07:24,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_09-model_00-model_states.pt. 0: [2022-11-25 22:07:24,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_10-model_00-model_states.pt... 32: [2022-11-25 22:07:24,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_29-model_00-model_states.pt. 32: [2022-11-25 22:07:24,742] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_30-model_00-model_states.pt... 0: [2022-11-25 22:07:24,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_10-model_00-model_states.pt. 0: [2022-11-25 22:07:24,805] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_11-model_00-model_states.pt... 32: [2022-11-25 22:07:24,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_30-model_00-model_states.pt. 32: [2022-11-25 22:07:24,979] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_31-model_00-model_states.pt... 0: [2022-11-25 22:07:25,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_11-model_00-model_states.pt. 0: [2022-11-25 22:07:25,023] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_12-model_00-model_states.pt... 32: [2022-11-25 22:07:25,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_31-model_00-model_states.pt. 32: [2022-11-25 22:07:25,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_32-model_00-model_states.pt... 0: [2022-11-25 22:07:25,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_12-model_00-model_states.pt. 0: [2022-11-25 22:07:25,245] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_13-model_00-model_states.pt... 32: [2022-11-25 22:07:25,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_32-model_00-model_states.pt. 32: [2022-11-25 22:07:25,432] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_33-model_00-model_states.pt... 0: [2022-11-25 22:07:25,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_13-model_00-model_states.pt. 0: [2022-11-25 22:07:25,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_14-model_00-model_states.pt... 32: [2022-11-25 22:07:25,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_33-model_00-model_states.pt. 32: [2022-11-25 22:07:25,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_34-model_00-model_states.pt... 0: [2022-11-25 22:07:25,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_14-model_00-model_states.pt. 0: [2022-11-25 22:07:25,691] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_15-model_00-model_states.pt... 32: [2022-11-25 22:07:25,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_34-model_00-model_states.pt. 32: [2022-11-25 22:07:25,887] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_35-model_00-model_states.pt... 0: [2022-11-25 22:07:25,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_15-model_00-model_states.pt. 0: [2022-11-25 22:07:25,910] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_16-model_00-model_states.pt... 32: [2022-11-25 22:07:26,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_35-model_00-model_states.pt. 32: [2022-11-25 22:07:26,118] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_36-model_00-model_states.pt... 0: [2022-11-25 22:07:26,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_16-model_00-model_states.pt. 0: [2022-11-25 22:07:26,127] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_17-model_00-model_states.pt... 0: [2022-11-25 22:07:26,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_17-model_00-model_states.pt. 0: [2022-11-25 22:07:26,340] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_18-model_00-model_states.pt... 32: [2022-11-25 22:07:26,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_36-model_00-model_states.pt. 32: [2022-11-25 22:07:26,350] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_37-model_00-model_states.pt... 0: [2022-11-25 22:07:26,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_18-model_00-model_states.pt. 0: [2022-11-25 22:07:26,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_19-model_00-model_states.pt... 32: [2022-11-25 22:07:26,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_37-model_00-model_states.pt. 32: [2022-11-25 22:07:26,575] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_38-model_00-model_states.pt... 0: [2022-11-25 22:07:26,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_19-model_00-model_states.pt. 0: [2022-11-25 22:07:26,774] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_20-model_00-model_states.pt... 32: [2022-11-25 22:07:26,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_38-model_00-model_states.pt. 32: [2022-11-25 22:07:26,796] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/layer_40-model_00-model_states.pt... 32: [2022-11-25 22:07:26,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_40-model_00-model_states.pt. 32: [2022-11-25 22:07:26,812] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/mp_rank_01_model_states.pt... 32: [2022-11-25 22:07:26,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/mp_rank_01_model_states.pt. 0: [2022-11-25 22:07:26,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/layer_20-model_00-model_states.pt. 0: [2022-11-25 22:07:26,989] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step7000/mp_rank_00_model_states.pt 0: [2022-11-25 22:07:26,990] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/mp_rank_00_model_states.pt... 0: [2022-11-25 22:07:26,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/mp_rank_00_model_states.pt. 51: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 55: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 55: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 53: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 53: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 57: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 61: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 61: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 61: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 59: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 63: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 63: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 63: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 63: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 52: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 52: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 52: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 54: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 56: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 56: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 58: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 58: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 58: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 62: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 62: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 16: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 16: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 16: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 16: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 60: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 60: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 2: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 4: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 4: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 4: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 4: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 14: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 18: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 10: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 10: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 10: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 10: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 10: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 10: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 8: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 8: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 0: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 26: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 26: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 11: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 17: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 17: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 9: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 9: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 9: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 13: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 23: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 25: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 25: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 25: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 25: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 27: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 27: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 27: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 27: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 29: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 29: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 29: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 29: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 29: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 31: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 5: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 33: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 33: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 33: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 1: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 3: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 3: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 3: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 3: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 35: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 35: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 22: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 12: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 30: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 24: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 28: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 28: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 28: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 20: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 34: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 34: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 34: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 40: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 40: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 40: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 44: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 44: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 44: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 6: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 6: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 6: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 6: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 32: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 42: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 42: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 42: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 46: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 46: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 46: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 36: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 36: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 48: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 48: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 41: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 41: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 41: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 37: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 37: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 37: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 47: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 47: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 47: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 49: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 43: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 43: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 43: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 45: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 45: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 45: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 39: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 39: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 39: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 39: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 50: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 50: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 50: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 38: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 38: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 51: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 51: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 55: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 53: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 57: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 57: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 61: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 59: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 63: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 63: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 19: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 19: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 19: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 19: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 21: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 21: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 21: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 21: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 15: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 52: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 54: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 56: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 58: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 62: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 62: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 16: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 16: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 16: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 60: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 60: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 2: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 2: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 4: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 4: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 14: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 14: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 14: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 14: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 14: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 14: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 18: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 10: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 8: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 8: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 0: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 0: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 26: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 26: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 26: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 26: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 11: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 11: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 17: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 9: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 9: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 9: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 13: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 23: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 23: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 23: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 25: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 25: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 27: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 27: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 27: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 27: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 29: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 31: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 5: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 5: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 5: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 5: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 33: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 33: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 1: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 1: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 1: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 1: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 3: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 3: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 35: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 35: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 22: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 12: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 12: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 12: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 30: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 24: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 24: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 28: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 20: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 20: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 20: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 20: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 20: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 20: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 34: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 40: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 40: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 44: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 6: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 6: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 32: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 32: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 32: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 32: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 42: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 42: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 46: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 46: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 36: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 48: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 41: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 37: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 47: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 49: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 49: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 43: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 45: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 39: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 50: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 50: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 38: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 55: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 53: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 59: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 19: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 19: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 19: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 21: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 15: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 15: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 54: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 16: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 2: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 2: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 4: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 14: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 18: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 18: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 10: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 8: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 0: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 0: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 26: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 11: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 11: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 17: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 17: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 17: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 9: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 13: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 23: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 25: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 29: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 31: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 5: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 1: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 3: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 3: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 22: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 12: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 12: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 12: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 30: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 24: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 28: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 44: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 6: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 6: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 32: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 46: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 36: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 48: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 41: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 37: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 47: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 45: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 39: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 38: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 19: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 21: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 21: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 15: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 15: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 54: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 2: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 4: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 18: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 18: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 8: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 8: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 0: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 11: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 11: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 17: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 9: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 13: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 23: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 29: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 31: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 1: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 22: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 22: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 22: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 12: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 30: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 30: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 30: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 24: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 28: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 28: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 44: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 32: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 46: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 37: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 38: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 21: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 15: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 54: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 2: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 2: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 18: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 8: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 17: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 13: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 13: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 23: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 31: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 1: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 22: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 30: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 24: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 28: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 32: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 37: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 15: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 54: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 13: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 23: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 31: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 22: [2022-11-25 22:07:27,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 30: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 24: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 15: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 54: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 31: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 24: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 31: [2022-11-25 22:07:27,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step7000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:07:27,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-25 22:07:27,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 7: [2022-11-25 22:07:27,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 45: [2022-11-25 22:07:27,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 45: [2022-11-25 22:07:27,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 45: [2022-11-25 22:07:27,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 0: [2022-11-25 22:07:27,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 6: [2022-11-25 22:07:27,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 6: [2022-11-25 22:07:27,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-25 22:07:27,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 37: [2022-11-25 22:07:27,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 37: [2022-11-25 22:07:27,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-25 22:07:27,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 10: [2022-11-25 22:07:27,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 10: [2022-11-25 22:07:27,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 21: [2022-11-25 22:07:27,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 10: [2022-11-25 22:07:27,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 21: [2022-11-25 22:07:27,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-25 22:07:27,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 36: [2022-11-25 22:07:27,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:07:27,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:07:27,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-25 22:07:27,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 36: [2022-11-25 22:07:27,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 31: [2022-11-25 22:07:27,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 36: [2022-11-25 22:07:27,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 31: [2022-11-25 22:07:27,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-25 22:07:27,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 41: [2022-11-25 22:07:27,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 55: [2022-11-25 22:07:27,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 41: [2022-11-25 22:07:27,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 55: [2022-11-25 22:07:27,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 41: [2022-11-25 22:07:27,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 55: [2022-11-25 22:07:27,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 23: [2022-11-25 22:07:27,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-25 22:07:27,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 20: [2022-11-25 22:07:27,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 23: [2022-11-25 22:07:27,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 20: [2022-11-25 22:07:27,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-25 22:07:27,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 49: [2022-11-25 22:07:27,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 51: [2022-11-25 22:07:27,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 49: [2022-11-25 22:07:27,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-25 22:07:27,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 51: [2022-11-25 22:07:27,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-25 22:07:27,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 17: [2022-11-25 22:07:27,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-25 22:07:27,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-25 22:07:27,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 54: [2022-11-25 22:07:27,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 54: [2022-11-25 22:07:27,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-25 22:07:27,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 48: [2022-11-25 22:07:27,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-25 22:07:27,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-25 22:07:27,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 59: [2022-11-25 22:07:27,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-25 22:07:27,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-25 22:07:27,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 16: [2022-11-25 22:07:27,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:07:27,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-25 22:07:27,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 40: [2022-11-25 22:07:27,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-25 22:07:27,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-25 22:07:27,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 58: [2022-11-25 22:07:27,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-25 22:07:27,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-25 22:07:27,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 14: [2022-11-25 22:07:27,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-25 22:07:27,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-25 22:07:27,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 46: [2022-11-25 22:07:27,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-25 22:07:27,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-25 22:07:27,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 27: [2022-11-25 22:07:27,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-25 22:07:27,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-25 22:07:27,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 1: [2022-11-25 22:07:27,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-25 22:07:27,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-25 22:07:27,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 49: [2022-11-25 22:07:27,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-25 22:07:27,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-25 22:07:27,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 7: [2022-11-25 22:07:27,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 1: [2022-11-25 22:07:27,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 7: [2022-11-25 22:07:27,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 1: [2022-11-25 22:07:27,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 7: [2022-11-25 22:07:27,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 1: [2022-11-25 22:07:27,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 45: [2022-11-25 22:07:27,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 45: [2022-11-25 22:07:27,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-25 22:07:27,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 47: [2022-11-25 22:07:27,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-25 22:07:27,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-25 22:07:27,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 8: [2022-11-25 22:07:27,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-25 22:07:27,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-25 22:07:27,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 25: [2022-11-25 22:07:27,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-25 22:07:27,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-25 22:07:27,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 43: [2022-11-25 22:07:27,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-25 22:07:27,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-25 22:07:27,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-25 22:07:27,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-25 22:07:27,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 58: [2022-11-25 22:07:27,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 43: [2022-11-25 22:07:27,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 58: [2022-11-25 22:07:27,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-25 22:07:27,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 3: [2022-11-25 22:07:27,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 3: [2022-11-25 22:07:27,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 11: [2022-11-25 22:07:27,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 3: [2022-11-25 22:07:27,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 11: [2022-11-25 22:07:27,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-25 22:07:27,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 32: [2022-11-25 22:07:27,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-25 22:07:27,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-25 22:07:27,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 44: [2022-11-25 22:07:27,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-25 22:07:27,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 18: [2022-11-25 22:07:27,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 44: [2022-11-25 22:07:27,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 18: [2022-11-25 22:07:27,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-25 22:07:27,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 36: [2022-11-25 22:07:27,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 20: [2022-11-25 22:07:27,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-25 22:07:27,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 57: [2022-11-25 22:07:27,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 20: [2022-11-25 22:07:27,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 57: [2022-11-25 22:07:27,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 13: [2022-11-25 22:07:27,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 28: [2022-11-25 22:07:27,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 57: [2022-11-25 22:07:27,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 13: [2022-11-25 22:07:27,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-25 22:07:27,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 28: [2022-11-25 22:07:27,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-25 22:07:27,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 16: [2022-11-25 22:07:27,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:07:27,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-25 22:07:27,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 10: [2022-11-25 22:07:27,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-25 22:07:27,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 10: [2022-11-25 22:07:27,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 47: [2022-11-25 22:07:27,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-25 22:07:27,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 47: [2022-11-25 22:07:27,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 54: [2022-11-25 22:07:27,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 6: [2022-11-25 22:07:27,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 0: [2022-11-25 22:07:27,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 54: [2022-11-25 22:07:27,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 0: [2022-11-25 22:07:27,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 6: [2022-11-25 22:07:27,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-25 22:07:27,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 54: [2022-11-25 22:07:27,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 0: [2022-11-25 22:07:27,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 0: [2022-11-25 22:07:27,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 23: [2022-11-25 22:07:27,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 0: [2022-11-25 22:07:27,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 23: [2022-11-25 22:07:27,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 0: [2022-11-25 22:07:27,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 23: [2022-11-25 22:07:27,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 34: [2022-11-25 22:07:27,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 51: [2022-11-25 22:07:27,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-25 22:07:27,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-25 22:07:27,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 34: [2022-11-25 22:07:27,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-25 22:07:27,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 39: [2022-11-25 22:07:27,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 60: [2022-11-25 22:07:27,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 60: [2022-11-25 22:07:27,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 39: [2022-11-25 22:07:27,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 60: [2022-11-25 22:07:27,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 39: [2022-11-25 22:07:27,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 38: [2022-11-25 22:07:27,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-25 22:07:27,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-25 22:07:27,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-25 22:07:27,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-25 22:07:27,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 38: [2022-11-25 22:07:27,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 2: [2022-11-25 22:07:27,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-25 22:07:27,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-25 22:07:27,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 44: [2022-11-25 22:07:27,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 50: [2022-11-25 22:07:27,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 44: [2022-11-25 22:07:27,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 50: [2022-11-25 22:07:27,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 44: [2022-11-25 22:07:27,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 50: [2022-11-25 22:07:27,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 40: [2022-11-25 22:07:27,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-25 22:07:27,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-25 22:07:27,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 52: [2022-11-25 22:07:27,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-25 22:07:27,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-25 22:07:27,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 22: [2022-11-25 22:07:27,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-25 22:07:27,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-25 22:07:27,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 24: [2022-11-25 22:07:27,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-25 22:07:27,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-25 22:07:27,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 24: [2022-11-25 22:07:27,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-25 22:07:27,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 60: [2022-11-25 22:07:27,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 24: [2022-11-25 22:07:27,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 60: [2022-11-25 22:07:27,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-25 22:07:27,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 12: [2022-11-25 22:07:27,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 55: [2022-11-25 22:07:27,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 14: [2022-11-25 22:07:27,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 12: [2022-11-25 22:07:27,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 55: [2022-11-25 22:07:27,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 14: [2022-11-25 22:07:27,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 27: [2022-11-25 22:07:27,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 12: [2022-11-25 22:07:27,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 55: [2022-11-25 22:07:27,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 14: [2022-11-25 22:07:27,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 27: [2022-11-25 22:07:27,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-25 22:07:27,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 59: [2022-11-25 22:07:27,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 59: [2022-11-25 22:07:27,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-25 22:07:27,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 7: [2022-11-25 22:07:27,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 7: [2022-11-25 22:07:27,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-25 22:07:27,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 19: [2022-11-25 22:07:27,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 19: [2022-11-25 22:07:27,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-25 22:07:27,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 49: [2022-11-25 22:07:27,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-25 22:07:27,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 26: [2022-11-25 22:07:27,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 49: [2022-11-25 22:07:27,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 26: [2022-11-25 22:07:27,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-25 22:07:27,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 26: [2022-11-25 22:07:27,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 26: [2022-11-25 22:07:27,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-25 22:07:27,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 11: [2022-11-25 22:07:27,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-25 22:07:27,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-25 22:07:27,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 6: [2022-11-25 22:07:27,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-25 22:07:27,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-25 22:07:27,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 32: [2022-11-25 22:07:27,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-25 22:07:27,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-25 22:07:27,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 36: [2022-11-25 22:07:27,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-25 22:07:27,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 50: [2022-11-25 22:07:27,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 9: [2022-11-25 22:07:27,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 50: [2022-11-25 22:07:27,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 62: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-25 22:07:27,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-25 22:07:27,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 33: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 62: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 62: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 25: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 33: [2022-11-25 22:07:27,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 25: [2022-11-25 22:07:27,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 1: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 25: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 61: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 1: [2022-11-25 22:07:27,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 61: [2022-11-25 22:07:27,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 1: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 61: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 13: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-25 22:07:27,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 22: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-25 22:07:27,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 12: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-25 22:07:27,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 27: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-25 22:07:27,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 28: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 27: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 28: [2022-11-25 22:07:27,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 33: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-25 22:07:27,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 2: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 41: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 2: [2022-11-25 22:07:27,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 41: [2022-11-25 22:07:27,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 2: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 3: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 41: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 4: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 4: [2022-11-25 22:07:27,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 3: [2022-11-25 22:07:27,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 4: [2022-11-25 22:07:27,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 3: [2022-11-25 22:07:27,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 61: [2022-11-25 22:07:27,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-25 22:07:27,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-25 22:07:27,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 4: [2022-11-25 22:07:27,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 4: [2022-11-25 22:07:27,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-25 22:07:27,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 51: [2022-11-25 22:07:27,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-25 22:07:27,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-25 22:07:27,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 9: [2022-11-25 22:07:27,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 9: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-25 22:07:27,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-25 22:07:27,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 24: [2022-11-25 22:07:27,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-25 22:07:27,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-25 22:07:27,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 5: [2022-11-25 22:07:27,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 42: [2022-11-25 22:07:27,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 42: [2022-11-25 22:07:27,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 4: [2022-11-25 22:07:27,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 5: [2022-11-25 22:07:27,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-25 22:07:27,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 42: [2022-11-25 22:07:27,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-25 22:07:27,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-25 22:07:27,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 42: [2022-11-25 22:07:27,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 63: [2022-11-25 22:07:27,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-25 22:07:27,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-25 22:07:27,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 34: [2022-11-25 22:07:27,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:07:27,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 58: [2022-11-25 22:07:27,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:07:27,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 58: [2022-11-25 22:07:27,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-25 22:07:27,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 11: [2022-11-25 22:07:27,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 11: [2022-11-25 22:07:27,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 11: [2022-11-25 22:07:27,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 39: [2022-11-25 22:07:27,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-25 22:07:27,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-25 22:07:27,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 4: [2022-11-25 22:07:27,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-25 22:07:27,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 59: [2022-11-25 22:07:27,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-25 22:07:27,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-25 22:07:27,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 63: [2022-11-25 22:07:27,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-25 22:07:27,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-25 22:07:27,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 52: [2022-11-25 22:07:27,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-25 22:07:27,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-25 22:07:27,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 62: [2022-11-25 22:07:27,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-25 22:07:27,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-25 22:07:27,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 49: [2022-11-25 22:07:27,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-25 22:07:27,366] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-25 22:07:27,366] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 5: [2022-11-25 22:07:27,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-25 22:07:27,366] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-25 22:07:27,366] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 7: [2022-11-25 22:07:27,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-25 22:07:27,366] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-25 22:07:27,366] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 34: [2022-11-25 22:07:27,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:07:27,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-25 22:07:27,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 2: [2022-11-25 22:07:27,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 41: [2022-11-25 22:07:27,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-25 22:07:27,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 2: [2022-11-25 22:07:27,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 41: [2022-11-25 22:07:27,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 2: [2022-11-25 22:07:27,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 15: [2022-11-25 22:07:27,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-25 22:07:27,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-25 22:07:27,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 15: [2022-11-25 22:07:27,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-25 22:07:27,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-25 22:07:27,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 15: [2022-11-25 22:07:27,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 10: [2022-11-25 22:07:27,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 15: [2022-11-25 22:07:27,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 15: [2022-11-25 22:07:27,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 10: [2022-11-25 22:07:27,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-25 22:07:27,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 53: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-25 22:07:27,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-25 22:07:27,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-25 22:07:27,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 53: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 53: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 23: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 19: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 23: [2022-11-25 22:07:27,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 19: [2022-11-25 22:07:27,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 23: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 19: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 54: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 54: [2022-11-25 22:07:27,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 29: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 54: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 54: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 55: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 29: [2022-11-25 22:07:27,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-25 22:07:27,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 55: [2022-11-25 22:07:27,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 54: [2022-11-25 22:07:27,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 55: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 54: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 29: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 29: [2022-11-25 22:07:27,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 32: [2022-11-25 22:07:27,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-25 22:07:27,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-25 22:07:27,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 62: [2022-11-25 22:07:27,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-25 22:07:27,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-25 22:07:27,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 47: [2022-11-25 22:07:27,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 47: [2022-11-25 22:07:27,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 47: [2022-11-25 22:07:27,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 18: [2022-11-25 22:07:27,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-25 22:07:27,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-25 22:07:27,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 44: [2022-11-25 22:07:27,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-25 22:07:27,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-25 22:07:27,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 14: [2022-11-25 22:07:27,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-25 22:07:27,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-25 22:07:27,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 36: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 36: [2022-11-25 22:07:27,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-25 22:07:27,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 35: [2022-11-25 22:07:27,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 35: [2022-11-25 22:07:27,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-25 22:07:27,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 36: [2022-11-25 22:07:27,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 36: [2022-11-25 22:07:27,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-25 22:07:27,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 40: [2022-11-25 22:07:27,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-25 22:07:27,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-25 22:07:27,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 33: [2022-11-25 22:07:27,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 33: [2022-11-25 22:07:27,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 33: [2022-11-25 22:07:27,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 3: [2022-11-25 22:07:27,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 3: [2022-11-25 22:07:27,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 31: [2022-11-25 22:07:27,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 3: [2022-11-25 22:07:27,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 31: [2022-11-25 22:07:27,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-25 22:07:27,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 19: [2022-11-25 22:07:27,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-25 22:07:27,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-25 22:07:27,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 0: [2022-11-25 22:07:27,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-25 22:07:27,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-25 22:07:27,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 0: [2022-11-25 22:07:27,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 42: [2022-11-25 22:07:27,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 0: [2022-11-25 22:07:27,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 6: [2022-11-25 22:07:27,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 42: [2022-11-25 22:07:27,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-25 22:07:27,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 6: [2022-11-25 22:07:27,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 6: [2022-11-25 22:07:27,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 8: [2022-11-25 22:07:27,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-25 22:07:27,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-25 22:07:27,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 2: [2022-11-25 22:07:27,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-25 22:07:27,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-25 22:07:27,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 26: [2022-11-25 22:07:27,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 26: [2022-11-25 22:07:27,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-25 22:07:27,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 10: [2022-11-25 22:07:27,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-25 22:07:27,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-25 22:07:27,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 20: [2022-11-25 22:07:27,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-25 22:07:27,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-25 22:07:27,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 48: [2022-11-25 22:07:27,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-25 22:07:27,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-25 22:07:27,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 57: [2022-11-25 22:07:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:07:27,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-25 22:07:27,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 63: [2022-11-25 22:07:27,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-25 22:07:27,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-25 22:07:27,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 3: [2022-11-25 22:07:27,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-25 22:07:27,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-25 22:07:27,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 43: [2022-11-25 22:07:27,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 8: [2022-11-25 22:07:27,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 43: [2022-11-25 22:07:27,390] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 8: [2022-11-25 22:07:27,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 43: [2022-11-25 22:07:27,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 8: [2022-11-25 22:07:27,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 18: [2022-11-25 22:07:27,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-25 22:07:27,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-25 22:07:27,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 52: [2022-11-25 22:07:27,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-25 22:07:27,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-25 22:07:27,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 12: [2022-11-25 22:07:27,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-25 22:07:27,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-25 22:07:27,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 23: [2022-11-25 22:07:27,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-25 22:07:27,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-25 22:07:27,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 8: [2022-11-25 22:07:27,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-25 22:07:27,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-25 22:07:27,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 39: [2022-11-25 22:07:27,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-25 22:07:27,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-25 22:07:27,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 29: [2022-11-25 22:07:27,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-25 22:07:27,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-25 22:07:27,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 16: [2022-11-25 22:07:27,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:07:27,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-25 22:07:27,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 31: [2022-11-25 22:07:27,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-25 22:07:27,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-25 22:07:27,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 35: [2022-11-25 22:07:27,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-25 22:07:27,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-25 22:07:27,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 19: [2022-11-25 22:07:27,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-25 22:07:27,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-25 22:07:27,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 1: [2022-11-25 22:07:27,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-25 22:07:27,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-25 22:07:27,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 48: [2022-11-25 22:07:27,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-25 22:07:27,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-25 22:07:27,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 30: [2022-11-25 22:07:27,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-25 22:07:27,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-25 22:07:27,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 27: [2022-11-25 22:07:27,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 27: [2022-11-25 22:07:27,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-25 22:07:27,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 45: [2022-11-25 22:07:27,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-25 22:07:27,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-25 22:07:27,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 17: [2022-11-25 22:07:27,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-25 22:07:27,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-25 22:07:27,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 30: [2022-11-25 22:07:27,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-25 22:07:27,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-25 22:07:27,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 56: [2022-11-25 22:07:27,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-25 22:07:27,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-25 22:07:27,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 37: [2022-11-25 22:07:27,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-25 22:07:27,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-25 22:07:27,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 46: [2022-11-25 22:07:27,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-25 22:07:27,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-25 22:07:27,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 57: [2022-11-25 22:07:27,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:07:27,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 57: [2022-11-25 22:07:27,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 37: [2022-11-25 22:07:27,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 37: [2022-11-25 22:07:27,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-25 22:07:27,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 28: [2022-11-25 22:07:27,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-25 22:07:27,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-25 22:07:27,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 4: [2022-11-25 22:07:27,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-25 22:07:27,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-25 22:07:27,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 48: [2022-11-25 22:07:27,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 48: [2022-11-25 22:07:27,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-25 22:07:27,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 13: [2022-11-25 22:07:27,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 13: [2022-11-25 22:07:27,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 40: [2022-11-25 22:07:27,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 13: [2022-11-25 22:07:27,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 40: [2022-11-25 22:07:27,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-25 22:07:27,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 32: [2022-11-25 22:07:27,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 32: [2022-11-25 22:07:27,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-25 22:07:27,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 16: [2022-11-25 22:07:27,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:07:27,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-25 22:07:27,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 41: [2022-11-25 22:07:27,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 41: [2022-11-25 22:07:27,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 41: [2022-11-25 22:07:27,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 9: [2022-11-25 22:07:27,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 9: [2022-11-25 22:07:27,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-25 22:07:27,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 38: [2022-11-25 22:07:27,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-25 22:07:27,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-25 22:07:27,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 46: [2022-11-25 22:07:27,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-25 22:07:27,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-25 22:07:27,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 14: [2022-11-25 22:07:27,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-25 22:07:27,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-25 22:07:27,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 56: [2022-11-25 22:07:27,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-25 22:07:27,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-25 22:07:27,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 59: [2022-11-25 22:07:27,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-25 22:07:27,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-25 22:07:27,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 46: [2022-11-25 22:07:27,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-25 22:07:27,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-25 22:07:27,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 60: [2022-11-25 22:07:27,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-25 22:07:27,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-25 22:07:27,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 22: [2022-11-25 22:07:27,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-25 22:07:27,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-25 22:07:27,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 25: [2022-11-25 22:07:27,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 25: [2022-11-25 22:07:27,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-25 22:07:27,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 35: [2022-11-25 22:07:27,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 35: [2022-11-25 22:07:27,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-25 22:07:27,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 17: [2022-11-25 22:07:27,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-25 22:07:27,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-25 22:07:27,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 47: [2022-11-25 22:07:27,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-25 22:07:27,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-25 22:07:27,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 42: [2022-11-25 22:07:27,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 21: [2022-11-25 22:07:27,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 17: [2022-11-25 22:07:27,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 42: [2022-11-25 22:07:27,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 21: [2022-11-25 22:07:27,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 17: [2022-11-25 22:07:27,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 42: [2022-11-25 22:07:27,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 21: [2022-11-25 22:07:27,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 17: [2022-11-25 22:07:27,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 31: [2022-11-25 22:07:27,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 31: [2022-11-25 22:07:27,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-25 22:07:27,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 50: [2022-11-25 22:07:27,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 50: [2022-11-25 22:07:27,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-25 22:07:27,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 58: [2022-11-25 22:07:27,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-25 22:07:27,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-25 22:07:27,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 5: [2022-11-25 22:07:27,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-25 22:07:27,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-25 22:07:27,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 30: [2022-11-25 22:07:27,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 30: [2022-11-25 22:07:27,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 30: [2022-11-25 22:07:27,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 61: [2022-11-25 22:07:27,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-25 22:07:27,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-25 22:07:27,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 21: [2022-11-25 22:07:27,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-25 22:07:27,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 21: [2022-11-25 22:07:27,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 53: [2022-11-25 22:07:27,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-25 22:07:27,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-25 22:07:27,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 24: [2022-11-25 22:07:27,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-25 22:07:27,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-25 22:07:27,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 44: [2022-11-25 22:07:27,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-25 22:07:27,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-25 22:07:27,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 11: [2022-11-25 22:07:27,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 11: [2022-11-25 22:07:27,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-25 22:07:27,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 15: [2022-11-25 22:07:27,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 15: [2022-11-25 22:07:27,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-25 22:07:27,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 55: [2022-11-25 22:07:27,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 55: [2022-11-25 22:07:27,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 55: [2022-11-25 22:07:27,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 51: [2022-11-25 22:07:27,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 51: [2022-11-25 22:07:27,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-25 22:07:27,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 21: [2022-11-25 22:07:27,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 35: [2022-11-25 22:07:27,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-25 22:07:27,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 21: [2022-11-25 22:07:27,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 35: [2022-11-25 22:07:27,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 21: [2022-11-25 22:07:27,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 7: [2022-11-25 22:07:27,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-25 22:07:27,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-25 22:07:27,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 0: [2022-11-25 22:07:27,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-25 22:07:27,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-25 22:07:27,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 45: [2022-11-25 22:07:27,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-25 22:07:27,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-25 22:07:27,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 2: [2022-11-25 22:07:27,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-25 22:07:27,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-25 22:07:27,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 49: [2022-11-25 22:07:27,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-25 22:07:27,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 49: [2022-11-25 22:07:27,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 34: [2022-11-25 22:07:27,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:07:27,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-25 22:07:27,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 54: [2022-11-25 22:07:27,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 54: [2022-11-25 22:07:27,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-25 22:07:27,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 56: [2022-11-25 22:07:27,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 56: [2022-11-25 22:07:27,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-25 22:07:27,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-25 22:07:27,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-25 22:07:27,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 56: [2022-11-25 22:07:27,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 36: [2022-11-25 22:07:27,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-25 22:07:27,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-25 22:07:27,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 62: [2022-11-25 22:07:27,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-25 22:07:27,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-25 22:07:27,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 6: [2022-11-25 22:07:27,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 20: [2022-11-25 22:07:27,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 6: [2022-11-25 22:07:27,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 20: [2022-11-25 22:07:27,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 6: [2022-11-25 22:07:27,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 20: [2022-11-25 22:07:27,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 42: [2022-11-25 22:07:27,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 42: [2022-11-25 22:07:27,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-25 22:07:27,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 10: [2022-11-25 22:07:27,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-25 22:07:27,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-25 22:07:27,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 43: [2022-11-25 22:07:27,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-25 22:07:27,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-25 22:07:27,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 38: [2022-11-25 22:07:27,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 38: [2022-11-25 22:07:27,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-25 22:07:27,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 37: [2022-11-25 22:07:27,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-25 22:07:27,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-25 22:07:27,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 27: [2022-11-25 22:07:27,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 39: [2022-11-25 22:07:27,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 27: [2022-11-25 22:07:27,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 39: [2022-11-25 22:07:27,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 27: [2022-11-25 22:07:27,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 39: [2022-11-25 22:07:27,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 57: [2022-11-25 22:07:27,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:07:27,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 48: [2022-11-25 22:07:27,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:07:27,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 48: [2022-11-25 22:07:27,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-25 22:07:27,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 8: [2022-11-25 22:07:27,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-25 22:07:27,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-25 22:07:27,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 33: [2022-11-25 22:07:27,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-25 22:07:27,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-25 22:07:27,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 46: [2022-11-25 22:07:27,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-25 22:07:27,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-25 22:07:27,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 3: [2022-11-25 22:07:27,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-25 22:07:27,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-25 22:07:27,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 4: [2022-11-25 22:07:27,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 4: [2022-11-25 22:07:27,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-25 22:07:27,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 19: [2022-11-25 22:07:27,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-25 22:07:27,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-25 22:07:27,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 16: [2022-11-25 22:07:27,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:07:27,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-25 22:07:27,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 31: [2022-11-25 22:07:27,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-25 22:07:27,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-25 22:07:27,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 23: [2022-11-25 22:07:27,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 23: [2022-11-25 22:07:27,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 23: [2022-11-25 22:07:27,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 26: [2022-11-25 22:07:27,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-25 22:07:27,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-25 22:07:27,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 52: [2022-11-25 22:07:27,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-25 22:07:27,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-25 22:07:27,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 18: [2022-11-25 22:07:27,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-25 22:07:27,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-25 22:07:27,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 9: [2022-11-25 22:07:27,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 9: [2022-11-25 22:07:27,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-25 22:07:27,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 21: [2022-11-25 22:07:27,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-25 22:07:27,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-25 22:07:27,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 1: [2022-11-25 22:07:27,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 1: [2022-11-25 22:07:27,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-25 22:07:27,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 41: [2022-11-25 22:07:27,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-25 22:07:27,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-25 22:07:27,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 29: [2022-11-25 22:07:27,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-25 22:07:27,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-25 22:07:27,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 63: [2022-11-25 22:07:27,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-25 22:07:27,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-25 22:07:27,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 37: [2022-11-25 22:07:27,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 37: [2022-11-25 22:07:27,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-25 22:07:27,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 28: [2022-11-25 22:07:27,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 28: [2022-11-25 22:07:27,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-25 22:07:27,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 12: [2022-11-25 22:07:27,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-25 22:07:27,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-25 22:07:27,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 40: [2022-11-25 22:07:27,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 40: [2022-11-25 22:07:27,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 40: [2022-11-25 22:07:27,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 14: [2022-11-25 22:07:27,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-25 22:07:27,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-25 22:07:27,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 22: [2022-11-25 22:07:27,514] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 13: [2022-11-25 22:07:27,514] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 22: [2022-11-25 22:07:27,514] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 13: [2022-11-25 22:07:27,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-25 22:07:27,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 22: [2022-11-25 22:07:27,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 32: [2022-11-25 22:07:27,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 32: [2022-11-25 22:07:27,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-25 22:07:27,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 47: [2022-11-25 22:07:27,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-25 22:07:27,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-25 22:07:27,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 17: [2022-11-25 22:07:27,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-25 22:07:27,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-25 22:07:27,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 50: [2022-11-25 22:07:27,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-25 22:07:27,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-25 22:07:27,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 58: [2022-11-25 22:07:27,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 58: [2022-11-25 22:07:27,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-25 22:07:27,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 25: [2022-11-25 22:07:27,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 25: [2022-11-25 22:07:27,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-25 22:07:27,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 11: [2022-11-25 22:07:27,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-25 22:07:27,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 11: [2022-11-25 22:07:27,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 30: [2022-11-25 22:07:27,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-25 22:07:27,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-25 22:07:27,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 43: [2022-11-25 22:07:27,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 43: [2022-11-25 22:07:27,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 15: [2022-11-25 22:07:27,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 43: [2022-11-25 22:07:27,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 15: [2022-11-25 22:07:27,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-25 22:07:27,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 44: [2022-11-25 22:07:27,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 44: [2022-11-25 22:07:27,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 44: [2022-11-25 22:07:27,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 24: [2022-11-25 22:07:27,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-25 22:07:27,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-25 22:07:27,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 51: [2022-11-25 22:07:27,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-25 22:07:27,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-25 22:07:27,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 59: [2022-11-25 22:07:27,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-25 22:07:27,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-25 22:07:27,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 55: [2022-11-25 22:07:27,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-25 22:07:27,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-25 22:07:27,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 5: [2022-11-25 22:07:27,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-25 22:07:27,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 5: [2022-11-25 22:07:27,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 56: [2022-11-25 22:07:27,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-25 22:07:27,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 56: [2022-11-25 22:07:27,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 0: [2022-11-25 22:07:27,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 0: [2022-11-25 22:07:27,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 0: [2022-11-25 22:07:27,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 20: [2022-11-25 22:07:27,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-25 22:07:27,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-25 22:07:27,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 35: [2022-11-25 22:07:27,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-25 22:07:27,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-25 22:07:27,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 7: [2022-11-25 22:07:27,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-25 22:07:27,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-25 22:07:27,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 49: [2022-11-25 22:07:27,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-25 22:07:27,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-25 22:07:27,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 2: [2022-11-25 22:07:27,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-25 22:07:27,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-25 22:07:27,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 61: [2022-11-25 22:07:27,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 61: [2022-11-25 22:07:27,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-25 22:07:27,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 42: [2022-11-25 22:07:27,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:07:27,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 42: [2022-11-25 22:07:27,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 34: [2022-11-25 22:07:27,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 42: [2022-11-25 22:07:27,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 34: [2022-11-25 22:07:27,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 39: [2022-11-25 22:07:27,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-25 22:07:27,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-25 22:07:27,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 36: [2022-11-25 22:07:27,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 38: [2022-11-25 22:07:27,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 36: [2022-11-25 22:07:27,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 38: [2022-11-25 22:07:27,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 36: [2022-11-25 22:07:27,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 38: [2022-11-25 22:07:27,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 54: [2022-11-25 22:07:27,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 54: [2022-11-25 22:07:27,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-25 22:07:27,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 62: [2022-11-25 22:07:27,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-25 22:07:27,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-25 22:07:27,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 27: [2022-11-25 22:07:27,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 6: [2022-11-25 22:07:27,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 6: [2022-11-25 22:07:27,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 27: [2022-11-25 22:07:27,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 6: [2022-11-25 22:07:27,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 27: [2022-11-25 22:07:27,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 19: [2022-11-25 22:07:27,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-25 22:07:27,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-25 22:07:27,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 8: [2022-11-25 22:07:27,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 8: [2022-11-25 22:07:27,547] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-25 22:07:27,547] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 23: [2022-11-25 22:07:27,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-25 22:07:27,549] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-25 22:07:27,549] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 28: [2022-11-25 22:07:27,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-25 22:07:27,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-25 22:07:27,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 52: [2022-11-25 22:07:27,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-25 22:07:27,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-25 22:07:27,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 12: [2022-11-25 22:07:27,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 12: [2022-11-25 22:07:27,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-25 22:07:27,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 9: [2022-11-25 22:07:27,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 9: [2022-11-25 22:07:27,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 46: [2022-11-25 22:07:27,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 9: [2022-11-25 22:07:27,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 46: [2022-11-25 22:07:27,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-25 22:07:27,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 63: [2022-11-25 22:07:27,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 63: [2022-11-25 22:07:27,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-25 22:07:27,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 3: [2022-11-25 22:07:27,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:07:27,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 3: [2022-11-25 22:07:27,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 16: [2022-11-25 22:07:27,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 3: [2022-11-25 22:07:27,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 16: [2022-11-25 22:07:27,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 26: [2022-11-25 22:07:27,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-25 22:07:27,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-25 22:07:27,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 33: [2022-11-25 22:07:27,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-25 22:07:27,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-25 22:07:27,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 10: [2022-11-25 22:07:27,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 10: [2022-11-25 22:07:27,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 10: [2022-11-25 22:07:27,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 21: [2022-11-25 22:07:27,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-25 22:07:27,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-25 22:07:27,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 60: [2022-11-25 22:07:27,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-25 22:07:27,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-25 22:07:27,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 4: [2022-11-25 22:07:27,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 4: [2022-11-25 22:07:27,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-25 22:07:27,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 57: [2022-11-25 22:07:27,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:07:27,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 57: [2022-11-25 22:07:27,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 45: [2022-11-25 22:07:27,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-25 22:07:27,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-25 22:07:27,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 18: [2022-11-25 22:07:27,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 60: [2022-11-25 22:07:27,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 18: [2022-11-25 22:07:27,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-25 22:07:27,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 60: [2022-11-25 22:07:27,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-25 22:07:27,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 31: [2022-11-25 22:07:27,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-25 22:07:27,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-25 22:07:27,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 13: [2022-11-25 22:07:27,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-25 22:07:27,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-25 22:07:27,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 29: [2022-11-25 22:07:27,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-25 22:07:27,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-25 22:07:27,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 41: [2022-11-25 22:07:27,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-25 22:07:27,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-25 22:07:27,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 40: [2022-11-25 22:07:27,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-25 22:07:27,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-25 22:07:27,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 14: [2022-11-25 22:07:27,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-25 22:07:27,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-25 22:07:27,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 37: [2022-11-25 22:07:27,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-25 22:07:27,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-25 22:07:27,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 47: [2022-11-25 22:07:27,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-25 22:07:27,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-25 22:07:27,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 1: [2022-11-25 22:07:27,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-25 22:07:27,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 25: [2022-11-25 22:07:27,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 1: [2022-11-25 22:07:27,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 25: [2022-11-25 22:07:27,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-25 22:07:27,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 32: [2022-11-25 22:07:27,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-25 22:07:27,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-25 22:07:27,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 5: [2022-11-25 22:07:27,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-25 22:07:27,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-25 22:07:27,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 17: [2022-11-25 22:07:27,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-25 22:07:27,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-25 22:07:27,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 22: [2022-11-25 22:07:27,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-25 22:07:27,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-25 22:07:27,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 50: [2022-11-25 22:07:27,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-25 22:07:27,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-25 22:07:27,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 58: [2022-11-25 22:07:27,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 58: [2022-11-25 22:07:27,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-25 22:07:27,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 43: [2022-11-25 22:07:27,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-25 22:07:27,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-25 22:07:27,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 59: [2022-11-25 22:07:27,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-25 22:07:27,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-25 22:07:27,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 30: [2022-11-25 22:07:27,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 51: [2022-11-25 22:07:27,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 15: [2022-11-25 22:07:27,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 30: [2022-11-25 22:07:27,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 51: [2022-11-25 22:07:27,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 15: [2022-11-25 22:07:27,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 51: [2022-11-25 22:07:27,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 15: [2022-11-25 22:07:27,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 30: [2022-11-25 22:07:27,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 35: [2022-11-25 22:07:27,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 35: [2022-11-25 22:07:27,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-25 22:07:27,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 24: [2022-11-25 22:07:27,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 24: [2022-11-25 22:07:27,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 44: [2022-11-25 22:07:27,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 24: [2022-11-25 22:07:27,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 44: [2022-11-25 22:07:27,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-25 22:07:27,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 11: [2022-11-25 22:07:27,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 11: [2022-11-25 22:07:27,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 11: [2022-11-25 22:07:27,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 20: [2022-11-25 22:07:27,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-25 22:07:27,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-25 22:07:27,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 55: [2022-11-25 22:07:27,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-25 22:07:27,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-25 22:07:27,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 49: [2022-11-25 22:07:27,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-25 22:07:27,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-25 22:07:27,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 45: [2022-11-25 22:07:27,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-25 22:07:27,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-25 22:07:27,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 38: [2022-11-25 22:07:27,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-25 22:07:27,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-25 22:07:27,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 34: [2022-11-25 22:07:27,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:07:27,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-25 22:07:27,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 2: [2022-11-25 22:07:27,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-25 22:07:27,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-25 22:07:27,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 7: [2022-11-25 22:07:27,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 7: [2022-11-25 22:07:27,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-25 22:07:27,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 48: [2022-11-25 22:07:27,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-25 22:07:27,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-25 22:07:27,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 6: [2022-11-25 22:07:27,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-25 22:07:27,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-25 22:07:27,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 56: [2022-11-25 22:07:27,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 56: [2022-11-25 22:07:27,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 56: [2022-11-25 22:07:27,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 54: [2022-11-25 22:07:27,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 54: [2022-11-25 22:07:27,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-25 22:07:27,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 0: [2022-11-25 22:07:27,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-25 22:07:27,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 0: [2022-11-25 22:07:27,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 10: [2022-11-25 22:07:27,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-25 22:07:27,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-25 22:07:27,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 62: [2022-11-25 22:07:27,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-25 22:07:27,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 62: [2022-11-25 22:07:27,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 19: [2022-11-25 22:07:27,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-25 22:07:27,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-25 22:07:27,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 27: [2022-11-25 22:07:27,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-25 22:07:27,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-25 22:07:27,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 45: [2022-11-25 22:07:27,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-25 22:07:27,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-25 22:07:27,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 46: [2022-11-25 22:07:27,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-25 22:07:27,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-25 22:07:27,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 53: [2022-11-25 22:07:27,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-25 22:07:27,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-25 22:07:27,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 36: [2022-11-25 22:07:27,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-25 22:07:27,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-25 22:07:27,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 3: [2022-11-25 22:07:27,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 3: [2022-11-25 22:07:27,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-25 22:07:27,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 48: [2022-11-25 22:07:27,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-25 22:07:27,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-25 22:07:27,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 17: [2022-11-25 22:07:27,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 17: [2022-11-25 22:07:27,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-25 22:07:27,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 39: [2022-11-25 22:07:27,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-25 22:07:27,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-25 22:07:27,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 60: [2022-11-25 22:07:27,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-25 22:07:27,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-25 22:07:27,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 63: [2022-11-25 22:07:27,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-25 22:07:27,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-25 22:07:27,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 8: [2022-11-25 22:07:27,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-25 22:07:27,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-25 22:07:27,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 1: [2022-11-25 22:07:27,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-25 22:07:27,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 1: [2022-11-25 22:07:27,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 4: [2022-11-25 22:07:27,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-25 22:07:27,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-25 22:07:27,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 33: [2022-11-25 22:07:27,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 33: [2022-11-25 22:07:27,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-25 22:07:27,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 18: [2022-11-25 22:07:27,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-25 22:07:27,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-25 22:07:27,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 12: [2022-11-25 22:07:27,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-25 22:07:27,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-25 22:07:27,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 29: [2022-11-25 22:07:27,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-25 22:07:27,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-25 22:07:27,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 57: [2022-11-25 22:07:27,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 52: [2022-11-25 22:07:27,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:07:27,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 52: [2022-11-25 22:07:27,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-25 22:07:27,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 57: [2022-11-25 22:07:27,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 21: [2022-11-25 22:07:27,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 60: [2022-11-25 22:07:27,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-25 22:07:27,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 21: [2022-11-25 22:07:27,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 60: [2022-11-25 22:07:27,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 21: [2022-11-25 22:07:27,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 28: [2022-11-25 22:07:27,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 28: [2022-11-25 22:07:27,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-25 22:07:27,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 9: [2022-11-25 22:07:27,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-25 22:07:27,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-25 22:07:27,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 14: [2022-11-25 22:07:27,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 14: [2022-11-25 22:07:27,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-25 22:07:27,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 37: [2022-11-25 22:07:27,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-25 22:07:27,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-25 22:07:27,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 40: [2022-11-25 22:07:27,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-25 22:07:27,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-25 22:07:27,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 41: [2022-11-25 22:07:27,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-25 22:07:27,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-25 22:07:27,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 42: [2022-11-25 22:07:27,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 32: [2022-11-25 22:07:27,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 42: [2022-11-25 22:07:27,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 32: [2022-11-25 22:07:27,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 42: [2022-11-25 22:07:27,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 32: [2022-11-25 22:07:27,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 13: [2022-11-25 22:07:27,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-25 22:07:27,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-25 22:07:27,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 61: [2022-11-25 22:07:27,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-25 22:07:27,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-25 22:07:27,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 59: [2022-11-25 22:07:27,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 59: [2022-11-25 22:07:27,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-25 22:07:27,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 50: [2022-11-25 22:07:27,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-25 22:07:27,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-25 22:07:27,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 58: [2022-11-25 22:07:27,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-25 22:07:27,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-25 22:07:27,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 22: [2022-11-25 22:07:27,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-25 22:07:27,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-25 22:07:27,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 26: [2022-11-25 22:07:27,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-25 22:07:27,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-25 22:07:27,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 47: [2022-11-25 22:07:27,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-25 22:07:27,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-25 22:07:27,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 31: [2022-11-25 22:07:27,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-25 22:07:27,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-25 22:07:27,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 43: [2022-11-25 22:07:27,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-25 22:07:27,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-25 22:07:27,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 53: [2022-11-25 22:07:27,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-25 22:07:27,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-25 22:07:27,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 16: [2022-11-25 22:07:27,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:07:27,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-25 22:07:27,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 24: [2022-11-25 22:07:27,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 24: [2022-11-25 22:07:27,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-25 22:07:27,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 15: [2022-11-25 22:07:27,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-25 22:07:27,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-25 22:07:27,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 30: [2022-11-25 22:07:27,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 11: [2022-11-25 22:07:27,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 30: [2022-11-25 22:07:27,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 30: [2022-11-25 22:07:27,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 11: [2022-11-25 22:07:27,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 11: [2022-11-25 22:07:27,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 61: [2022-11-25 22:07:27,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-25 22:07:27,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-25 22:07:27,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 56: [2022-11-25 22:07:27,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-25 22:07:27,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-25 22:07:27,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 55: [2022-11-25 22:07:27,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 55: [2022-11-25 22:07:27,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-25 22:07:27,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 62: [2022-11-25 22:07:27,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-25 22:07:27,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-25 22:07:27,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 10: [2022-11-25 22:07:27,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 0: [2022-11-25 22:07:27,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 10: [2022-11-25 22:07:27,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 0: [2022-11-25 22:07:27,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 10: [2022-11-25 22:07:27,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 0: [2022-11-25 22:07:27,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 20: [2022-11-25 22:07:27,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-25 22:07:27,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-25 22:07:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 2: [2022-11-25 22:07:27,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-25 22:07:27,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-25 22:07:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 23: [2022-11-25 22:07:27,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-25 22:07:27,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-25 22:07:27,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 36: [2022-11-25 22:07:27,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 51: [2022-11-25 22:07:27,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 36: [2022-11-25 22:07:27,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-25 22:07:27,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 6: [2022-11-25 22:07:27,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 38: [2022-11-25 22:07:27,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 6: [2022-11-25 22:07:27,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 38: [2022-11-25 22:07:27,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-25 22:07:27,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 6: [2022-11-25 22:07:27,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 5: [2022-11-25 22:07:27,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 5: [2022-11-25 22:07:27,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-25 22:07:27,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 27: [2022-11-25 22:07:27,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 27: [2022-11-25 22:07:27,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-25 22:07:27,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 54: [2022-11-25 22:07:27,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 54: [2022-11-25 22:07:27,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-25 22:07:27,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 7: [2022-11-25 22:07:27,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 7: [2022-11-25 22:07:27,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 7: [2022-11-25 22:07:27,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 53: [2022-11-25 22:07:27,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-25 22:07:27,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-25 22:07:27,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 17: [2022-11-25 22:07:27,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 42: [2022-11-25 22:07:27,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 17: [2022-11-25 22:07:27,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-25 22:07:27,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 42: [2022-11-25 22:07:27,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-25 22:07:27,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 51: [2022-11-25 22:07:27,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-25 22:07:27,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 46: [2022-11-25 22:07:27,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-25 22:07:27,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 19: [2022-11-25 22:07:27,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 46: [2022-11-25 22:07:27,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 19: [2022-11-25 22:07:27,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-25 22:07:27,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 25: [2022-11-25 22:07:27,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 25: [2022-11-25 22:07:27,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 25: [2022-11-25 22:07:27,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 3: [2022-11-25 22:07:27,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-25 22:07:27,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-25 22:07:27,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 16: [2022-11-25 22:07:27,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:07:27,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 8: [2022-11-25 22:07:27,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:07:27,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 8: [2022-11-25 22:07:27,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-25 22:07:27,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 33: [2022-11-25 22:07:27,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-25 22:07:27,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-25 22:07:27,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 57: [2022-11-25 22:07:27,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:07:27,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-25 22:07:27,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 60: [2022-11-25 22:07:27,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-25 22:07:27,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-25 22:07:27,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 21: [2022-11-25 22:07:27,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-25 22:07:27,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-25 22:07:27,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 31: [2022-11-25 22:07:27,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 45: [2022-11-25 22:07:27,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 31: [2022-11-25 22:07:27,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 45: [2022-11-25 22:07:27,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 31: [2022-11-25 22:07:27,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 45: [2022-11-25 22:07:27,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 4: [2022-11-25 22:07:27,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-25 22:07:27,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-25 22:07:27,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 52: [2022-11-25 22:07:27,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-25 22:07:27,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-25 22:07:27,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 23: [2022-11-25 22:07:27,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-25 22:07:27,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-25 22:07:27,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 9: [2022-11-25 22:07:27,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 9: [2022-11-25 22:07:27,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-25 22:07:27,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 63: [2022-11-25 22:07:27,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-25 22:07:27,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-25 22:07:27,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 18: [2022-11-25 22:07:27,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-25 22:07:27,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-25 22:07:27,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 39: [2022-11-25 22:07:27,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-25 22:07:27,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-25 22:07:27,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 29: [2022-11-25 22:07:27,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 26: [2022-11-25 22:07:27,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 29: [2022-11-25 22:07:27,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 26: [2022-11-25 22:07:27,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 29: [2022-11-25 22:07:27,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 12: [2022-11-25 22:07:27,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 26: [2022-11-25 22:07:27,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 12: [2022-11-25 22:07:27,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-25 22:07:27,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 49: [2022-11-25 22:07:27,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 14: [2022-11-25 22:07:27,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 49: [2022-11-25 22:07:27,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-25 22:07:27,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 14: [2022-11-25 22:07:27,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-25 22:07:27,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 28: [2022-11-25 22:07:27,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-25 22:07:27,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-25 22:07:27,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 37: [2022-11-25 22:07:27,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-25 22:07:27,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-25 22:07:27,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 40: [2022-11-25 22:07:27,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-25 22:07:27,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-25 22:07:27,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 1: [2022-11-25 22:07:27,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-25 22:07:27,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-25 22:07:27,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 44: [2022-11-25 22:07:27,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-25 22:07:27,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-25 22:07:27,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 13: [2022-11-25 22:07:27,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-25 22:07:27,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-25 22:07:27,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 22: [2022-11-25 22:07:27,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-25 22:07:27,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-25 22:07:27,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 58: [2022-11-25 22:07:27,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-25 22:07:27,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-25 22:07:27,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 59: [2022-11-25 22:07:27,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-25 22:07:27,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-25 22:07:27,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 9: [2022-11-25 22:07:27,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-25 22:07:27,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-25 22:07:27,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 32: [2022-11-25 22:07:27,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 35: [2022-11-25 22:07:27,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 32: [2022-11-25 22:07:27,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 35: [2022-11-25 22:07:27,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 32: [2022-11-25 22:07:27,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 35: [2022-11-25 22:07:27,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 5: [2022-11-25 22:07:27,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-25 22:07:27,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-25 22:07:27,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 57: [2022-11-25 22:07:27,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:07:27,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 43: [2022-11-25 22:07:27,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:07:27,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 43: [2022-11-25 22:07:27,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-25 22:07:27,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 51: [2022-11-25 22:07:27,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-25 22:07:27,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-25 22:07:27,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 55: [2022-11-25 22:07:27,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-25 22:07:27,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-25 22:07:27,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 44: [2022-11-25 22:07:27,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-25 22:07:27,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-25 22:07:27,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 30: [2022-11-25 22:07:27,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-25 22:07:27,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 18: [2022-11-25 22:07:27,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 30: [2022-11-25 22:07:27,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 18: [2022-11-25 22:07:27,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-25 22:07:27,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 38: [2022-11-25 22:07:27,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 30: [2022-11-25 22:07:27,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 38: [2022-11-25 22:07:27,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-25 22:07:27,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 50: [2022-11-25 22:07:27,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 30: [2022-11-25 22:07:27,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 50: [2022-11-25 22:07:27,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-25 22:07:27,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-25 22:07:27,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 30: [2022-11-25 22:07:27,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 50: [2022-11-25 22:07:27,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-25 22:07:27,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 29: [2022-11-25 22:07:27,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-25 22:07:27,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-25 22:07:27,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 15: [2022-11-25 22:07:27,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-25 22:07:27,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-25 22:07:27,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 20: [2022-11-25 22:07:27,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-25 22:07:27,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 11: [2022-11-25 22:07:27,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 20: [2022-11-25 22:07:27,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 11: [2022-11-25 22:07:27,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 33: [2022-11-25 22:07:27,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-25 22:07:27,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 11: [2022-11-25 22:07:27,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 33: [2022-11-25 22:07:27,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 26: [2022-11-25 22:07:27,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 26: [2022-11-25 22:07:27,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-25 22:07:27,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 12: [2022-11-25 22:07:27,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 28: [2022-11-25 22:07:27,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 12: [2022-11-25 22:07:27,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-25 22:07:27,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 28: [2022-11-25 22:07:27,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-25 22:07:27,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 63: [2022-11-25 22:07:27,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-25 22:07:27,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-25 22:07:27,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 25: [2022-11-25 22:07:27,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-25 22:07:27,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-25 22:07:27,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 25: [2022-11-25 22:07:27,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 25: [2022-11-25 22:07:27,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 52: [2022-11-25 22:07:27,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 25: [2022-11-25 22:07:27,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 52: [2022-11-25 22:07:27,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-25 22:07:27,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 13: [2022-11-25 22:07:27,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-25 22:07:27,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-25 22:07:27,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 48: [2022-11-25 22:07:27,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-25 22:07:27,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-25 22:07:27,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 5: [2022-11-25 22:07:27,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 5: [2022-11-25 22:07:27,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-25 22:07:27,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 39: [2022-11-25 22:07:27,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-25 22:07:27,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-25 22:07:27,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 56: [2022-11-25 22:07:27,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 61: [2022-11-25 22:07:27,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 56: [2022-11-25 22:07:27,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 61: [2022-11-25 22:07:27,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-25 22:07:27,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 56: [2022-11-25 22:07:27,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 53: [2022-11-25 22:07:27,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-25 22:07:27,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-25 22:07:27,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 41: [2022-11-25 22:07:27,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 41: [2022-11-25 22:07:27,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-25 22:07:27,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 34: [2022-11-25 22:07:27,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:07:27,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-25 22:07:27,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 22: [2022-11-25 22:07:27,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 22: [2022-11-25 22:07:27,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 22: [2022-11-25 22:07:27,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 61: [2022-11-25 22:07:27,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-25 22:07:27,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-25 22:07:27,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 24: [2022-11-25 22:07:27,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-25 22:07:27,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-25 22:07:27,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 35: [2022-11-25 22:07:27,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-25 22:07:27,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-25 22:07:27,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 47: [2022-11-25 22:07:27,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-25 22:07:27,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step7000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-25 22:07:27,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! 0: successfully saved checkpoint at iteration 7000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5247.90 63: iteration 7010/ 24424 | consumed samples: 3589120 | consumed tokens: 7350517760 | elapsed time per iteration (s): 4.59 | learning rate: 1.674E-04 | global batch size: 512 | lm loss: 2.246701E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 111.625 | TFLOPs: 11.49 | 63: iteration 7020/ 24424 | consumed samples: 3594240 | consumed tokens: 7361003520 | elapsed time per iteration (s): 2.55 | learning rate: 1.673E-04 | global batch size: 512 | lm loss: 2.227477E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 200.689 | TFLOPs: 20.66 | 63: iteration 7030/ 24424 | consumed samples: 3599360 | consumed tokens: 7371489280 | elapsed time per iteration (s): 2.26 | learning rate: 1.672E-04 | global batch size: 512 | lm loss: 2.204634E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.437 | TFLOPs: 23.31 | 63: iteration 7040/ 24424 | consumed samples: 3604480 | consumed tokens: 7381975040 | elapsed time per iteration (s): 2.23 | learning rate: 1.671E-04 | global batch size: 512 | lm loss: 2.212026E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.104 | TFLOPs: 23.59 | 63: iteration 7050/ 24424 | consumed samples: 3609600 | consumed tokens: 7392460800 | elapsed time per iteration (s): 2.23 | learning rate: 1.671E-04 | global batch size: 512 | lm loss: 2.212078E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.610 | TFLOPs: 23.64 | 63: iteration 7060/ 24424 | consumed samples: 3614720 | consumed tokens: 7402946560 | elapsed time per iteration (s): 2.25 | learning rate: 1.670E-04 | global batch size: 512 | lm loss: 2.227959E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.138 | TFLOPs: 23.38 | 63: iteration 7070/ 24424 | consumed samples: 3619840 | consumed tokens: 7413432320 | elapsed time per iteration (s): 2.26 | learning rate: 1.669E-04 | global batch size: 512 | lm loss: 2.222857E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.670 | TFLOPs: 23.33 | 63: iteration 7080/ 24424 | consumed samples: 3624960 | consumed tokens: 7423918080 | elapsed time per iteration (s): 2.27 | learning rate: 1.668E-04 | global batch size: 512 | lm loss: 2.245162E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.313 | TFLOPs: 23.19 | 63: iteration 7090/ 24424 | consumed samples: 3630080 | consumed tokens: 7434403840 | elapsed time per iteration (s): 2.24 | learning rate: 1.667E-04 | global batch size: 512 | lm loss: 2.235218E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.612 | TFLOPs: 23.53 | 63: iteration 7100/ 24424 | consumed samples: 3635200 | consumed tokens: 7444889600 | elapsed time per iteration (s): 2.26 | learning rate: 1.666E-04 | global batch size: 512 | lm loss: 2.220516E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.448 | TFLOPs: 23.31 | 63: iteration 7110/ 24424 | consumed samples: 3640320 | consumed tokens: 7455375360 | elapsed time per iteration (s): 2.25 | learning rate: 1.665E-04 | global batch size: 512 | lm loss: 2.215150E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.263 | TFLOPs: 23.40 | 63: iteration 7120/ 24424 | consumed samples: 3645440 | consumed tokens: 7465861120 | elapsed time per iteration (s): 2.23 | learning rate: 1.664E-04 | global batch size: 512 | lm loss: 2.222145E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.278 | TFLOPs: 23.60 | 63: iteration 7130/ 24424 | consumed samples: 3650560 | consumed tokens: 7476346880 | elapsed time per iteration (s): 2.26 | learning rate: 1.663E-04 | global batch size: 512 | lm loss: 2.235526E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.205 | TFLOPs: 23.29 | 63: iteration 7140/ 24424 | consumed samples: 3655680 | consumed tokens: 7486832640 | elapsed time per iteration (s): 2.26 | learning rate: 1.662E-04 | global batch size: 512 | lm loss: 2.207171E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.725 | TFLOPs: 23.34 | 63: iteration 7150/ 24424 | consumed samples: 3660800 | consumed tokens: 7497318400 | elapsed time per iteration (s): 2.29 | learning rate: 1.661E-04 | global batch size: 512 | lm loss: 2.203449E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.451 | TFLOPs: 23.00 | 63: iteration 7160/ 24424 | consumed samples: 3665920 | consumed tokens: 7507804160 | elapsed time per iteration (s): 2.26 | learning rate: 1.660E-04 | global batch size: 512 | lm loss: 2.182107E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.592 | TFLOPs: 23.33 | 63: iteration 7170/ 24424 | consumed samples: 3671040 | consumed tokens: 7518289920 | elapsed time per iteration (s): 3.16 | learning rate: 1.660E-04 | global batch size: 512 | lm loss: 2.208290E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 162.151 | TFLOPs: 16.69 | 63: iteration 7180/ 24424 | consumed samples: 3676160 | consumed tokens: 7528775680 | elapsed time per iteration (s): 2.34 | learning rate: 1.659E-04 | global batch size: 512 | lm loss: 2.242919E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 218.578 | TFLOPs: 22.50 | 63: iteration 7190/ 24424 | consumed samples: 3681280 | consumed tokens: 7539261440 | elapsed time per iteration (s): 2.29 | learning rate: 1.658E-04 | global batch size: 512 | lm loss: 2.223001E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.928 | TFLOPs: 23.05 | 63: iteration 7200/ 24424 | consumed samples: 3686400 | consumed tokens: 7549747200 | elapsed time per iteration (s): 2.25 | learning rate: 1.657E-04 | global batch size: 512 | lm loss: 2.203776E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.300 | TFLOPs: 23.40 | 63: iteration 7210/ 24424 | consumed samples: 3691520 | consumed tokens: 7560232960 | elapsed time per iteration (s): 2.25 | learning rate: 1.656E-04 | global batch size: 512 | lm loss: 2.193547E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.567 | TFLOPs: 23.43 | 63: iteration 7220/ 24424 | consumed samples: 3696640 | consumed tokens: 7570718720 | elapsed time per iteration (s): 2.24 | learning rate: 1.655E-04 | global batch size: 512 | lm loss: 2.200461E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.127 | TFLOPs: 23.48 | 63: iteration 7230/ 24424 | consumed samples: 3701760 | consumed tokens: 7581204480 | elapsed time per iteration (s): 2.24 | learning rate: 1.654E-04 | global batch size: 512 | lm loss: 2.213628E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.469 | TFLOPs: 23.52 | 63: iteration 7240/ 24424 | consumed samples: 3706880 | consumed tokens: 7591690240 | elapsed time per iteration (s): 2.28 | learning rate: 1.653E-04 | global batch size: 512 | lm loss: 2.212985E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.002 | TFLOPs: 23.16 | 63: iteration 7250/ 24424 | consumed samples: 3712000 | consumed tokens: 7602176000 | elapsed time per iteration (s): 2.23 | learning rate: 1.652E-04 | global batch size: 512 | lm loss: 2.202164E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.447 | TFLOPs: 23.62 | 63: iteration 7260/ 24424 | consumed samples: 3717120 | consumed tokens: 7612661760 | elapsed time per iteration (s): 2.25 | learning rate: 1.651E-04 | global batch size: 512 | lm loss: 2.245412E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.801 | TFLOPs: 23.45 | 63: iteration 7270/ 24424 | consumed samples: 3722240 | consumed tokens: 7623147520 | elapsed time per iteration (s): 2.26 | learning rate: 1.650E-04 | global batch size: 512 | lm loss: 2.213927E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.789 | TFLOPs: 23.35 | 63: iteration 7280/ 24424 | consumed samples: 3727360 | consumed tokens: 7633633280 | elapsed time per iteration (s): 2.27 | learning rate: 1.649E-04 | global batch size: 512 | lm loss: 2.195210E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.217 | TFLOPs: 23.19 | 63: iteration 7290/ 24424 | consumed samples: 3732480 | consumed tokens: 7644119040 | elapsed time per iteration (s): 2.26 | learning rate: 1.649E-04 | global batch size: 512 | lm loss: 2.194112E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.905 | TFLOPs: 23.36 | 63: iteration 7300/ 24424 | consumed samples: 3737600 | consumed tokens: 7654604800 | elapsed time per iteration (s): 2.34 | learning rate: 1.648E-04 | global batch size: 512 | lm loss: 2.227438E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 219.101 | TFLOPs: 22.56 | 63: iteration 7310/ 24424 | consumed samples: 3742720 | consumed tokens: 7665090560 | elapsed time per iteration (s): 2.23 | learning rate: 1.647E-04 | global batch size: 512 | lm loss: 2.219527E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.619 | TFLOPs: 23.64 | 63: iteration 7320/ 24424 | consumed samples: 3747840 | consumed tokens: 7675576320 | elapsed time per iteration (s): 2.23 | learning rate: 1.646E-04 | global batch size: 512 | lm loss: 2.189995E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.391 | TFLOPs: 23.61 | 63: iteration 7330/ 24424 | consumed samples: 3752960 | consumed tokens: 7686062080 | elapsed time per iteration (s): 2.26 | learning rate: 1.645E-04 | global batch size: 512 | lm loss: 2.201509E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.089 | TFLOPs: 23.27 | 63: iteration 7340/ 24424 | consumed samples: 3758080 | consumed tokens: 7696547840 | elapsed time per iteration (s): 2.23 | learning rate: 1.644E-04 | global batch size: 512 | lm loss: 2.211683E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.629 | TFLOPs: 23.64 | 63: iteration 7350/ 24424 | consumed samples: 3763200 | consumed tokens: 7707033600 | elapsed time per iteration (s): 2.28 | learning rate: 1.643E-04 | global batch size: 512 | lm loss: 2.210443E+00 | grad norm: 0.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.097 | TFLOPs: 23.07 | 63: iteration 7360/ 24424 | consumed samples: 3768320 | consumed tokens: 7717519360 | elapsed time per iteration (s): 2.29 | learning rate: 1.642E-04 | global batch size: 512 | lm loss: 2.205666E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.583 | TFLOPs: 23.02 | 63: iteration 7370/ 24424 | consumed samples: 3773440 | consumed tokens: 7728005120 | elapsed time per iteration (s): 2.26 | learning rate: 1.641E-04 | global batch size: 512 | lm loss: 2.212182E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.536 | TFLOPs: 23.32 | 63: iteration 7380/ 24424 | consumed samples: 3778560 | consumed tokens: 7738490880 | elapsed time per iteration (s): 2.25 | learning rate: 1.640E-04 | global batch size: 512 | lm loss: 2.202537E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.489 | TFLOPs: 23.42 | 63: iteration 7390/ 24424 | consumed samples: 3783680 | consumed tokens: 7748976640 | elapsed time per iteration (s): 2.25 | learning rate: 1.639E-04 | global batch size: 512 | lm loss: 2.215367E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.575 | TFLOPs: 23.43 | 63: iteration 7400/ 24424 | consumed samples: 3788800 | consumed tokens: 7759462400 | elapsed time per iteration (s): 2.25 | learning rate: 1.638E-04 | global batch size: 512 | lm loss: 2.209032E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.797 | TFLOPs: 23.45 | 63: iteration 7410/ 24424 | consumed samples: 3793920 | consumed tokens: 7769948160 | elapsed time per iteration (s): 2.25 | learning rate: 1.637E-04 | global batch size: 512 | lm loss: 2.210139E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.235 | TFLOPs: 23.39 | 63: iteration 7420/ 24424 | consumed samples: 3799040 | consumed tokens: 7780433920 | elapsed time per iteration (s): 2.24 | learning rate: 1.636E-04 | global batch size: 512 | lm loss: 2.205654E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.327 | TFLOPs: 23.51 | 63: iteration 7430/ 24424 | consumed samples: 3804160 | consumed tokens: 7790919680 | elapsed time per iteration (s): 2.25 | learning rate: 1.635E-04 | global batch size: 512 | lm loss: 2.191393E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.747 | TFLOPs: 23.45 | 63: iteration 7440/ 24424 | consumed samples: 3809280 | consumed tokens: 7801405440 | elapsed time per iteration (s): 2.25 | learning rate: 1.635E-04 | global batch size: 512 | lm loss: 2.212121E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.674 | TFLOPs: 23.44 | 63: iteration 7450/ 24424 | consumed samples: 3814400 | consumed tokens: 7811891200 | elapsed time per iteration (s): 2.24 | learning rate: 1.634E-04 | global batch size: 512 | lm loss: 2.181896E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.447 | TFLOPs: 23.52 | 63: iteration 7460/ 24424 | consumed samples: 3819520 | consumed tokens: 7822376960 | elapsed time per iteration (s): 2.29 | learning rate: 1.633E-04 | global batch size: 512 | lm loss: 2.201684E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.898 | TFLOPs: 23.05 | 63: iteration 7470/ 24424 | consumed samples: 3824640 | consumed tokens: 7832862720 | elapsed time per iteration (s): 2.23 | learning rate: 1.632E-04 | global batch size: 512 | lm loss: 2.198197E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.251 | TFLOPs: 23.60 | 63: iteration 7480/ 24424 | consumed samples: 3829760 | consumed tokens: 7843348480 | elapsed time per iteration (s): 2.25 | learning rate: 1.631E-04 | global batch size: 512 | lm loss: 2.171002E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.146 | TFLOPs: 23.38 | 63: iteration 7490/ 24424 | consumed samples: 3834880 | consumed tokens: 7853834240 | elapsed time per iteration (s): 2.77 | learning rate: 1.630E-04 | global batch size: 512 | lm loss: 2.199285E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.060 | TFLOPs: 19.05 | 63: iteration 7500/ 24424 | consumed samples: 3840000 | consumed tokens: 7864320000 | elapsed time per iteration (s): 2.23 | learning rate: 1.629E-04 | global batch size: 512 | lm loss: 2.218328E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.267 | TFLOPs: 23.60 | 63: iteration 7510/ 24424 | consumed samples: 3845120 | consumed tokens: 7874805760 | elapsed time per iteration (s): 2.27 | learning rate: 1.628E-04 | global batch size: 512 | lm loss: 2.197719E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.552 | TFLOPs: 23.22 | 63: iteration 7520/ 24424 | consumed samples: 3850240 | consumed tokens: 7885291520 | elapsed time per iteration (s): 2.23 | learning rate: 1.627E-04 | global batch size: 512 | lm loss: 2.227276E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.637 | TFLOPs: 23.64 | 63: iteration 7530/ 24424 | consumed samples: 3855360 | consumed tokens: 7895777280 | elapsed time per iteration (s): 2.24 | learning rate: 1.626E-04 | global batch size: 512 | lm loss: 2.181417E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.407 | TFLOPs: 23.51 | 63: iteration 7540/ 24424 | consumed samples: 3860480 | consumed tokens: 7906263040 | elapsed time per iteration (s): 2.23 | learning rate: 1.625E-04 | global batch size: 512 | lm loss: 2.191878E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.339 | TFLOPs: 23.61 | 63: iteration 7550/ 24424 | consumed samples: 3865600 | consumed tokens: 7916748800 | elapsed time per iteration (s): 2.25 | learning rate: 1.624E-04 | global batch size: 512 | lm loss: 2.186790E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.487 | TFLOPs: 23.42 | 63: iteration 7560/ 24424 | consumed samples: 3870720 | consumed tokens: 7927234560 | elapsed time per iteration (s): 2.26 | learning rate: 1.623E-04 | global batch size: 512 | lm loss: 2.202323E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.096 | TFLOPs: 23.28 | 63: iteration 7570/ 24424 | consumed samples: 3875840 | consumed tokens: 7937720320 | elapsed time per iteration (s): 2.23 | learning rate: 1.622E-04 | global batch size: 512 | lm loss: 2.200645E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.621 | TFLOPs: 23.64 | 63: iteration 7580/ 24424 | consumed samples: 3880960 | consumed tokens: 7948206080 | elapsed time per iteration (s): 2.23 | learning rate: 1.621E-04 | global batch size: 512 | lm loss: 2.186646E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.639 | TFLOPs: 23.64 | 63: iteration 7590/ 24424 | consumed samples: 3886080 | consumed tokens: 7958691840 | elapsed time per iteration (s): 2.24 | learning rate: 1.620E-04 | global batch size: 512 | lm loss: 2.201443E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.074 | TFLOPs: 23.58 | 63: iteration 7600/ 24424 | consumed samples: 3891200 | consumed tokens: 7969177600 | elapsed time per iteration (s): 2.26 | learning rate: 1.619E-04 | global batch size: 512 | lm loss: 2.203596E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.594 | TFLOPs: 23.33 | 63: iteration 7610/ 24424 | consumed samples: 3896320 | consumed tokens: 7979663360 | elapsed time per iteration (s): 4.37 | learning rate: 1.618E-04 | global batch size: 512 | lm loss: 2.203478E+00 | grad norm: 0.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 117.032 | TFLOPs: 12.05 | 63: iteration 7620/ 24424 | consumed samples: 3901440 | consumed tokens: 7990149120 | elapsed time per iteration (s): 2.25 | learning rate: 1.617E-04 | global batch size: 512 | lm loss: 2.200287E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.475 | TFLOPs: 23.42 | 63: iteration 7630/ 24424 | consumed samples: 3906560 | consumed tokens: 8000634880 | elapsed time per iteration (s): 2.27 | learning rate: 1.616E-04 | global batch size: 512 | lm loss: 2.199664E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.379 | TFLOPs: 23.20 | 63: iteration 7640/ 24424 | consumed samples: 3911680 | consumed tokens: 8011120640 | elapsed time per iteration (s): 2.32 | learning rate: 1.616E-04 | global batch size: 512 | lm loss: 2.198779E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.344 | TFLOPs: 22.68 | 63: iteration 7650/ 24424 | consumed samples: 3916800 | consumed tokens: 8021606400 | elapsed time per iteration (s): 2.23 | learning rate: 1.615E-04 | global batch size: 512 | lm loss: 2.185098E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.572 | TFLOPs: 23.63 | 63: iteration 7660/ 24424 | consumed samples: 3921920 | consumed tokens: 8032092160 | elapsed time per iteration (s): 2.24 | learning rate: 1.614E-04 | global batch size: 512 | lm loss: 2.198035E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.258 | TFLOPs: 23.50 | 63: iteration 7670/ 24424 | consumed samples: 3927040 | consumed tokens: 8042577920 | elapsed time per iteration (s): 2.28 | learning rate: 1.613E-04 | global batch size: 512 | lm loss: 2.194730E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.019 | TFLOPs: 23.16 | 63: iteration 7680/ 24424 | consumed samples: 3932160 | consumed tokens: 8053063680 | elapsed time per iteration (s): 2.24 | learning rate: 1.612E-04 | global batch size: 512 | lm loss: 2.195751E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.480 | TFLOPs: 23.52 | 63: iteration 7690/ 24424 | consumed samples: 3937280 | consumed tokens: 8063549440 | elapsed time per iteration (s): 2.23 | learning rate: 1.611E-04 | global batch size: 512 | lm loss: 2.190024E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.404 | TFLOPs: 23.62 | 63: iteration 7700/ 24424 | consumed samples: 3942400 | consumed tokens: 8074035200 | elapsed time per iteration (s): 2.26 | learning rate: 1.610E-04 | global batch size: 512 | lm loss: 2.196872E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.620 | TFLOPs: 23.33 | 63: iteration 7710/ 24424 | consumed samples: 3947520 | consumed tokens: 8084520960 | elapsed time per iteration (s): 2.27 | learning rate: 1.609E-04 | global batch size: 512 | lm loss: 2.218149E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.662 | TFLOPs: 23.23 | 63: iteration 7720/ 24424 | consumed samples: 3952640 | consumed tokens: 8095006720 | elapsed time per iteration (s): 2.26 | learning rate: 1.608E-04 | global batch size: 512 | lm loss: 2.190726E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.330 | TFLOPs: 23.30 | 63: iteration 7730/ 24424 | consumed samples: 3957760 | consumed tokens: 8105492480 | elapsed time per iteration (s): 2.23 | learning rate: 1.607E-04 | global batch size: 512 | lm loss: 2.202168E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.753 | TFLOPs: 23.65 | 63: iteration 7740/ 24424 | consumed samples: 3962880 | consumed tokens: 8115978240 | elapsed time per iteration (s): 2.25 | learning rate: 1.606E-04 | global batch size: 512 | lm loss: 2.201166E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.975 | TFLOPs: 23.47 | 63: iteration 7750/ 24424 | consumed samples: 3968000 | consumed tokens: 8126464000 | elapsed time per iteration (s): 2.23 | learning rate: 1.605E-04 | global batch size: 512 | lm loss: 2.193665E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.329 | TFLOPs: 23.61 | 63: iteration 7760/ 24424 | consumed samples: 3973120 | consumed tokens: 8136949760 | elapsed time per iteration (s): 2.25 | learning rate: 1.604E-04 | global batch size: 512 | lm loss: 2.201185E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.217 | TFLOPs: 23.39 | 63: iteration 7770/ 24424 | consumed samples: 3978240 | consumed tokens: 8147435520 | elapsed time per iteration (s): 2.23 | learning rate: 1.603E-04 | global batch size: 512 | lm loss: 2.211976E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.828 | TFLOPs: 23.66 | 63: iteration 7780/ 24424 | consumed samples: 3983360 | consumed tokens: 8157921280 | elapsed time per iteration (s): 2.24 | learning rate: 1.602E-04 | global batch size: 512 | lm loss: 2.185200E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.912 | TFLOPs: 23.57 | 63: iteration 7790/ 24424 | consumed samples: 3988480 | consumed tokens: 8168407040 | elapsed time per iteration (s): 3.75 | learning rate: 1.601E-04 | global batch size: 512 | lm loss: 2.189678E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.606 | TFLOPs: 14.06 | 63: iteration 7800/ 24424 | consumed samples: 3993600 | consumed tokens: 8178892800 | elapsed time per iteration (s): 2.24 | learning rate: 1.600E-04 | global batch size: 512 | lm loss: 2.208185E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.052 | TFLOPs: 23.58 | 63: iteration 7810/ 24424 | consumed samples: 3998720 | consumed tokens: 8189378560 | elapsed time per iteration (s): 2.26 | learning rate: 1.599E-04 | global batch size: 512 | lm loss: 2.193131E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.841 | TFLOPs: 23.35 | 63: iteration 7820/ 24424 | consumed samples: 4003840 | consumed tokens: 8199864320 | elapsed time per iteration (s): 2.26 | learning rate: 1.598E-04 | global batch size: 512 | lm loss: 2.203806E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.699 | TFLOPs: 23.34 | 63: iteration 7830/ 24424 | consumed samples: 4008960 | consumed tokens: 8210350080 | elapsed time per iteration (s): 2.24 | learning rate: 1.597E-04 | global batch size: 512 | lm loss: 2.199837E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.706 | TFLOPs: 23.54 | 63: iteration 7840/ 24424 | consumed samples: 4014080 | consumed tokens: 8220835840 | elapsed time per iteration (s): 2.23 | learning rate: 1.596E-04 | global batch size: 512 | lm loss: 2.186798E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.211 | TFLOPs: 23.60 | 63: iteration 7850/ 24424 | consumed samples: 4019200 | consumed tokens: 8231321600 | elapsed time per iteration (s): 2.25 | learning rate: 1.595E-04 | global batch size: 512 | lm loss: 2.207716E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.347 | TFLOPs: 23.40 | 63: iteration 7860/ 24424 | consumed samples: 4024320 | consumed tokens: 8241807360 | elapsed time per iteration (s): 2.25 | learning rate: 1.594E-04 | global batch size: 512 | lm loss: 2.210191E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.385 | TFLOPs: 23.41 | 63: iteration 7870/ 24424 | consumed samples: 4029440 | consumed tokens: 8252293120 | elapsed time per iteration (s): 2.23 | learning rate: 1.593E-04 | global batch size: 512 | lm loss: 2.201787E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.481 | TFLOPs: 23.62 | 63: iteration 7880/ 24424 | consumed samples: 4034560 | consumed tokens: 8262778880 | elapsed time per iteration (s): 2.26 | learning rate: 1.592E-04 | global batch size: 512 | lm loss: 2.197123E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.763 | TFLOPs: 23.34 | 63: iteration 7890/ 24424 | consumed samples: 4039680 | consumed tokens: 8273264640 | elapsed time per iteration (s): 2.26 | learning rate: 1.591E-04 | global batch size: 512 | lm loss: 2.188645E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.418 | TFLOPs: 23.31 | 63: iteration 7900/ 24424 | consumed samples: 4044800 | consumed tokens: 8283750400 | elapsed time per iteration (s): 2.24 | learning rate: 1.590E-04 | global batch size: 512 | lm loss: 2.174693E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.445 | TFLOPs: 23.52 | 63: iteration 7910/ 24424 | consumed samples: 4049920 | consumed tokens: 8294236160 | elapsed time per iteration (s): 2.23 | learning rate: 1.589E-04 | global batch size: 512 | lm loss: 2.185051E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.962 | TFLOPs: 23.67 | 63: iteration 7920/ 24424 | consumed samples: 4055040 | consumed tokens: 8304721920 | elapsed time per iteration (s): 2.47 | learning rate: 1.588E-04 | global batch size: 512 | lm loss: 2.199183E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 207.273 | TFLOPs: 21.34 | 63: iteration 7930/ 24424 | consumed samples: 4060160 | consumed tokens: 8315207680 | elapsed time per iteration (s): 2.40 | learning rate: 1.587E-04 | global batch size: 512 | lm loss: 2.195965E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 212.907 | TFLOPs: 21.92 | 63: iteration 7940/ 24424 | consumed samples: 4065280 | consumed tokens: 8325693440 | elapsed time per iteration (s): 3.81 | learning rate: 1.586E-04 | global batch size: 512 | lm loss: 2.177958E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 134.428 | TFLOPs: 13.84 | 63: iteration 7950/ 24424 | consumed samples: 4070400 | consumed tokens: 8336179200 | elapsed time per iteration (s): 2.22 | learning rate: 1.585E-04 | global batch size: 512 | lm loss: 2.194175E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.263 | TFLOPs: 23.70 | 63: iteration 7960/ 24424 | consumed samples: 4075520 | consumed tokens: 8346664960 | elapsed time per iteration (s): 2.23 | learning rate: 1.584E-04 | global batch size: 512 | lm loss: 2.178703E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.798 | TFLOPs: 23.66 | 63: iteration 7970/ 24424 | consumed samples: 4080640 | consumed tokens: 8357150720 | elapsed time per iteration (s): 2.23 | learning rate: 1.583E-04 | global batch size: 512 | lm loss: 2.184605E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.703 | TFLOPs: 23.65 | 63: iteration 7980/ 24424 | consumed samples: 4085760 | consumed tokens: 8367636480 | elapsed time per iteration (s): 2.25 | learning rate: 1.582E-04 | global batch size: 512 | lm loss: 2.187881E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.000 | TFLOPs: 23.47 | 63: iteration 7990/ 24424 | consumed samples: 4090880 | consumed tokens: 8378122240 | elapsed time per iteration (s): 2.23 | learning rate: 1.581E-04 | global batch size: 512 | lm loss: 2.213297E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.542 | TFLOPs: 23.63 | 0: [2022-11-25 22:46:29,545] [INFO] [logging.py:68:log_dist] [Rank 0] step=8000, skipped=0, lr=[0.00015804673037400077, 0.00015804673037400077, 0.00015804673037400077], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] 63: iteration 8000/ 24424 | consumed samples: 4096000 | consumed tokens: 8388608000 | elapsed time per iteration (s): 2.23 | learning rate: 1.580E-04 | global batch size: 512 | lm loss: 2.206809E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.887 | TFLOPs: 23.67 | 0: steps: 8000 loss: 2.2493 iter time (s): 2.312 samples/sec: 221.494 63: ------------------------------------------------------------------------------------------ 63: valid loss at iteration 8000 | lm loss value: 2.149882E+00 | lm loss PPL: 8.583846E+00 | 63: ------------------------------------------------------------------------------------------ 0: saving checkpoint at iteration 8000 to checkpoints_3b9 0: [2022-11-25 22:46:30,344] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step8000 is begin to save! 0: [2022-11-25 22:46:30,368] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_01-model_00-model_states.pt... 32: [2022-11-25 22:46:30,368] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_21-model_00-model_states.pt... 32: [2022-11-25 22:46:30,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_21-model_00-model_states.pt. 32: [2022-11-25 22:46:30,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_22-model_00-model_states.pt... 0: [2022-11-25 22:46:30,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_01-model_00-model_states.pt. 0: [2022-11-25 22:46:30,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_03-model_00-model_states.pt... 32: [2022-11-25 22:46:30,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_22-model_00-model_states.pt. 32: [2022-11-25 22:46:30,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_23-model_00-model_states.pt... 0: [2022-11-25 22:46:31,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_03-model_00-model_states.pt. 0: [2022-11-25 22:46:31,001] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_04-model_00-model_states.pt... 32: [2022-11-25 22:46:31,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_23-model_00-model_states.pt. 32: [2022-11-25 22:46:31,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_24-model_00-model_states.pt... 0: [2022-11-25 22:46:31,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_04-model_00-model_states.pt. 0: [2022-11-25 22:46:31,243] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_05-model_00-model_states.pt... 32: [2022-11-25 22:46:31,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_24-model_00-model_states.pt. 32: [2022-11-25 22:46:31,284] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_25-model_00-model_states.pt... 0: [2022-11-25 22:46:31,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_05-model_00-model_states.pt. 0: [2022-11-25 22:46:31,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_06-model_00-model_states.pt... 32: [2022-11-25 22:46:31,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_25-model_00-model_states.pt. 32: [2022-11-25 22:46:31,524] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_26-model_00-model_states.pt... 0: [2022-11-25 22:46:31,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_06-model_00-model_states.pt. 0: [2022-11-25 22:46:31,722] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_07-model_00-model_states.pt... 32: [2022-11-25 22:46:31,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_26-model_00-model_states.pt. 32: [2022-11-25 22:46:31,743] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_27-model_00-model_states.pt... 0: [2022-11-25 22:46:31,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_07-model_00-model_states.pt. 0: [2022-11-25 22:46:31,960] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_08-model_00-model_states.pt... 32: [2022-11-25 22:46:31,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_27-model_00-model_states.pt. 32: [2022-11-25 22:46:31,960] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_28-model_00-model_states.pt... 32: [2022-11-25 22:46:32,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_28-model_00-model_states.pt. 32: [2022-11-25 22:46:32,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_29-model_00-model_states.pt... 0: [2022-11-25 22:46:32,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_08-model_00-model_states.pt. 0: [2022-11-25 22:46:32,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_09-model_00-model_states.pt... 32: [2022-11-25 22:46:32,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_29-model_00-model_states.pt. 32: [2022-11-25 22:46:32,389] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_30-model_00-model_states.pt... 0: [2022-11-25 22:46:32,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_09-model_00-model_states.pt. 0: [2022-11-25 22:46:32,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_10-model_00-model_states.pt... 32: [2022-11-25 22:46:32,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_30-model_00-model_states.pt. 32: [2022-11-25 22:46:32,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_31-model_00-model_states.pt... 0: [2022-11-25 22:46:32,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_10-model_00-model_states.pt. 0: [2022-11-25 22:46:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_11-model_00-model_states.pt... 32: [2022-11-25 22:46:32,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_31-model_00-model_states.pt. 32: [2022-11-25 22:46:32,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_32-model_00-model_states.pt... 0: [2022-11-25 22:46:32,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_11-model_00-model_states.pt. 0: [2022-11-25 22:46:32,891] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_12-model_00-model_states.pt... 32: [2022-11-25 22:46:33,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_32-model_00-model_states.pt. 32: [2022-11-25 22:46:33,027] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_33-model_00-model_states.pt... 0: [2022-11-25 22:46:33,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_12-model_00-model_states.pt. 0: [2022-11-25 22:46:33,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_13-model_00-model_states.pt... 32: [2022-11-25 22:46:33,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_33-model_00-model_states.pt. 32: [2022-11-25 22:46:33,243] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_34-model_00-model_states.pt... 0: [2022-11-25 22:46:33,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_13-model_00-model_states.pt. 0: [2022-11-25 22:46:33,353] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_14-model_00-model_states.pt... 32: [2022-11-25 22:46:33,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_34-model_00-model_states.pt. 32: [2022-11-25 22:46:33,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_35-model_00-model_states.pt... 0: [2022-11-25 22:46:33,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_14-model_00-model_states.pt. 0: [2022-11-25 22:46:33,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_15-model_00-model_states.pt... 32: [2022-11-25 22:46:33,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_35-model_00-model_states.pt. 32: [2022-11-25 22:46:33,666] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_36-model_00-model_states.pt... 0: [2022-11-25 22:46:33,809] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_15-model_00-model_states.pt. 0: [2022-11-25 22:46:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_16-model_00-model_states.pt... 32: [2022-11-25 22:46:33,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_36-model_00-model_states.pt. 32: [2022-11-25 22:46:33,878] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_37-model_00-model_states.pt... 0: [2022-11-25 22:46:34,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_16-model_00-model_states.pt. 0: [2022-11-25 22:46:34,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_17-model_00-model_states.pt... 32: [2022-11-25 22:46:34,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_37-model_00-model_states.pt. 32: [2022-11-25 22:46:34,089] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_38-model_00-model_states.pt... 0: [2022-11-25 22:46:34,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_17-model_00-model_states.pt. 0: [2022-11-25 22:46:34,314] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_18-model_00-model_states.pt... 32: [2022-11-25 22:46:34,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_38-model_00-model_states.pt. 32: [2022-11-25 22:46:34,533] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_40-model_00-model_states.pt... 32: [2022-11-25 22:46:34,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_40-model_00-model_states.pt. 32: [2022-11-25 22:46:34,537] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/mp_rank_01_model_states.pt... 32: [2022-11-25 22:46:34,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/mp_rank_01_model_states.pt. 0: [2022-11-25 22:46:34,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_18-model_00-model_states.pt. 0: [2022-11-25 22:46:34,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_19-model_00-model_states.pt... 0: [2022-11-25 22:46:34,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_19-model_00-model_states.pt. 0: [2022-11-25 22:46:34,792] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/layer_20-model_00-model_states.pt... 0: [2022-11-25 22:46:35,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/layer_20-model_00-model_states.pt. 0: [2022-11-25 22:46:35,024] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step8000/mp_rank_00_model_states.pt 0: [2022-11-25 22:46:35,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/mp_rank_00_model_states.pt... 0: [2022-11-25 22:46:35,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/mp_rank_00_model_states.pt. 60: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 60: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 60: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 32: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 50: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 50: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 50: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 51: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 51: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 55: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 55: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 57: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 57: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 61: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 59: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 59: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 63: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 63: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 63: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 63: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 52: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 52: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 52: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 54: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 54: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 56: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 56: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 58: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 58: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 60: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 0: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 35: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 35: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 35: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 34: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 40: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 40: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 40: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 40: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 44: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 44: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 44: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 32: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 32: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 32: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 32: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 42: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 42: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 42: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 42: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 46: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 46: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 46: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 36: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 48: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 48: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 41: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 41: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 41: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 37: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 37: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 37: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 47: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 47: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 47: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 49: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 49: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 43: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 43: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 43: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 45: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 45: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 45: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 50: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 38: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 38: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 51: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 51: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 55: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 53: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 53: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 53: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 53: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 57: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 57: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 61: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 59: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 63: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 19: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 21: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 21: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 21: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 21: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 15: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 52: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 54: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 56: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 58: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 62: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 16: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 16: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 16: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 60: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 2: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 2: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 2: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 2: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 2: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 4: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 14: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 14: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 14: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 18: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 18: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 18: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 18: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 10: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 10: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 10: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 10: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 8: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 8: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 8: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 8: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 26: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 26: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 26: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 11: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 11: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 11: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 11: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 11: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 17: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 17: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 9: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 9: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 9: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 9: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 13: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 13: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 13: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 23: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 23: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 23: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 23: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 23: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 23: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 25: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 25: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 25: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 27: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 29: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 31: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 31: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 31: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 31: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 31: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 5: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 33: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 33: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 33: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 1: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 3: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 35: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 35: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 22: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 22: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 22: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 12: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 12: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 12: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 12: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 30: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 30: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 30: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 24: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 24: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 24: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 24: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 24: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 28: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 28: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 28: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 28: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 28: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 20: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 20: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 20: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 20: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 20: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 34: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 34: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 40: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 40: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 44: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 6: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 32: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 42: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 46: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 46: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 36: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 48: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 41: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 41: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 37: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 37: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 47: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 49: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 43: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 45: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 39: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 39: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 39: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 38: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 38: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 55: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 53: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 61: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 59: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 19: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 19: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 21: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 21: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 15: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 52: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 54: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 56: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 58: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 58: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 62: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 62: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 62: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 16: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 16: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 16: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 60: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 2: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 2: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 4: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 14: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 14: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 14: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 18: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 10: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 10: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 8: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 0: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 26: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 26: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 26: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 11: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 17: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 9: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 13: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 13: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 23: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 23: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 25: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 27: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 27: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 29: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 31: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 31: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 31: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 5: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 5: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 33: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 33: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 1: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 3: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 22: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 22: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 22: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 12: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 12: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 12: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 12: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 30: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 30: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 30: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 30: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 24: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 24: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 28: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 20: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 20: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 44: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 6: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 6: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 32: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 32: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 36: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 48: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 49: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 39: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 39: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 19: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 21: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 15: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 62: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 16: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 2: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 4: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 4: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 4: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 4: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 14: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 10: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 0: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 26: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 11: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 17: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 9: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 13: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 25: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 25: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 27: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 27: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 29: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 29: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 5: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 1: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 1: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 7: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 3: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 22: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 30: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 24: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 28: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 6: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 6: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 6: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 19: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 21: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 15: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 16: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 4: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 4: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 14: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 18: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 10: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 8: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 0: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 11: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 17: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 9: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 13: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 13: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 25: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 27: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 29: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 29: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 5: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 1: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 1: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 1: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 3: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 22: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 6: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 19: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 19: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 15: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 18: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 8: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 0: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 17: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 9: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 27: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 29: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 5: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 5: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 1: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 3: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 6: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 19: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 15: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 0: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 17: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 27: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 29: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 3: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 15: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 0: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 17: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 3: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 3: [2022-11-25 22:46:35,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step8000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 15: [2022-11-25 22:46:35,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-25 22:46:35,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-25 22:46:35,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 59: [2022-11-25 22:46:35,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 59: [2022-11-25 22:46:35,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-25 22:46:35,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 12: [2022-11-25 22:46:35,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 12: [2022-11-25 22:46:35,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-25 22:46:35,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 21: [2022-11-25 22:46:35,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-25 22:46:35,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 61: [2022-11-25 22:46:35,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 21: [2022-11-25 22:46:35,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 61: [2022-11-25 22:46:35,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-25 22:46:35,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 4: [2022-11-25 22:46:35,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 11: [2022-11-25 22:46:35,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-25 22:46:35,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-25 22:46:35,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 28: [2022-11-25 22:46:35,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-25 22:46:35,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-25 22:46:35,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 41: [2022-11-25 22:46:35,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 41: [2022-11-25 22:46:35,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 41: [2022-11-25 22:46:35,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 4: [2022-11-25 22:46:35,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-25 22:46:35,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 38: [2022-11-25 22:46:35,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-25 22:46:35,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-25 22:46:35,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 25: [2022-11-25 22:46:35,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-25 22:46:35,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-25 22:46:35,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 35: [2022-11-25 22:46:35,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 2: [2022-11-25 22:46:35,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 35: [2022-11-25 22:46:35,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 2: [2022-11-25 22:46:35,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 35: [2022-11-25 22:46:35,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 2: [2022-11-25 22:46:35,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 40: [2022-11-25 22:46:35,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-25 22:46:35,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-25 22:46:35,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 37: [2022-11-25 22:46:35,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 37: [2022-11-25 22:46:35,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 0: [2022-11-25 22:46:35,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 37: [2022-11-25 22:46:35,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 32: [2022-11-25 22:46:35,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-25 22:46:35,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-25 22:46:35,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 52: [2022-11-25 22:46:35,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-25 22:46:35,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-25 22:46:35,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 56: [2022-11-25 22:46:35,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 22: [2022-11-25 22:46:35,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-25 22:46:35,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 56: [2022-11-25 22:46:35,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 22: [2022-11-25 22:46:35,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 56: [2022-11-25 22:46:35,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 14: [2022-11-25 22:46:35,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-25 22:46:35,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 13: [2022-11-25 22:46:35,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 14: [2022-11-25 22:46:35,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 13: [2022-11-25 22:46:35,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-25 22:46:35,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 44: [2022-11-25 22:46:35,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-25 22:46:35,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-25 22:46:35,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 39: [2022-11-25 22:46:35,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-25 22:46:35,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-25 22:46:35,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 58: [2022-11-25 22:46:35,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-25 22:46:35,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-25 22:46:35,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 26: [2022-11-25 22:46:35,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-25 22:46:35,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-25 22:46:35,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 41: [2022-11-25 22:46:35,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 41: [2022-11-25 22:46:35,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-25 22:46:35,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 62: [2022-11-25 22:46:35,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-25 22:46:35,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-25 22:46:35,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 6: [2022-11-25 22:46:35,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 6: [2022-11-25 22:46:35,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-25 22:46:35,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 8: [2022-11-25 22:46:35,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-25 22:46:35,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-25 22:46:35,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 2: [2022-11-25 22:46:35,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-25 22:46:35,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-25 22:46:35,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 0: [2022-11-25 22:46:35,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-25 22:46:35,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 21: [2022-11-25 22:46:35,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 21: [2022-11-25 22:46:35,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 0: [2022-11-25 22:46:35,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 21: [2022-11-25 22:46:35,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 48: [2022-11-25 22:46:35,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-25 22:46:35,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-25 22:46:35,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 52: [2022-11-25 22:46:35,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-25 22:46:35,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 9: [2022-11-25 22:46:35,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 52: [2022-11-25 22:46:35,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 17: [2022-11-25 22:46:35,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 9: [2022-11-25 22:46:35,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 17: [2022-11-25 22:46:35,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-25 22:46:35,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 24: [2022-11-25 22:46:35,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-25 22:46:35,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-25 22:46:35,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 57: [2022-11-25 22:46:35,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:46:35,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 57: [2022-11-25 22:46:35,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 4: [2022-11-25 22:46:35,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-25 22:46:35,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-25 22:46:35,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 13: [2022-11-25 22:46:35,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-25 22:46:35,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-25 22:46:35,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 9: [2022-11-25 22:46:35,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 9: [2022-11-25 22:46:35,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 9: [2022-11-25 22:46:35,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-25 22:46:35,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 10: [2022-11-25 22:46:35,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-25 22:46:35,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-25 22:46:35,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-25 22:46:35,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-25 22:46:35,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 10: [2022-11-25 22:46:35,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 5: [2022-11-25 22:46:35,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-25 22:46:35,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 5: [2022-11-25 22:46:35,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 25: [2022-11-25 22:46:35,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 5: [2022-11-25 22:46:35,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-25 22:46:35,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 25: [2022-11-25 22:46:35,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 5: [2022-11-25 22:46:35,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 25: [2022-11-25 22:46:35,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 47: [2022-11-25 22:46:35,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 36: [2022-11-25 22:46:35,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 47: [2022-11-25 22:46:35,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 36: [2022-11-25 22:46:35,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 47: [2022-11-25 22:46:35,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 36: [2022-11-25 22:46:35,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 42: [2022-11-25 22:46:35,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 46: [2022-11-25 22:46:35,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-25 22:46:35,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 42: [2022-11-25 22:46:35,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 46: [2022-11-25 22:46:35,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 42: [2022-11-25 22:46:35,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-25 22:46:35,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-25 22:46:35,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 42: [2022-11-25 22:46:35,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 14: [2022-11-25 22:46:35,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 14: [2022-11-25 22:46:35,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-25 22:46:35,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 38: [2022-11-25 22:46:35,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-25 22:46:35,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-25 22:46:35,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 43: [2022-11-25 22:46:35,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-25 22:46:35,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-25 22:46:35,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 26: [2022-11-25 22:46:35,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 26: [2022-11-25 22:46:35,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-25 22:46:35,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 37: [2022-11-25 22:46:35,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-25 22:46:35,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-25 22:46:35,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 16: [2022-11-25 22:46:35,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:46:35,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-25 22:46:35,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 59: [2022-11-25 22:46:35,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-25 22:46:35,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-25 22:46:35,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 29: [2022-11-25 22:46:35,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-25 22:46:35,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 44: [2022-11-25 22:46:35,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 47: [2022-11-25 22:46:35,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 29: [2022-11-25 22:46:35,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 44: [2022-11-25 22:46:35,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 47: [2022-11-25 22:46:35,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 44: [2022-11-25 22:46:35,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 6: [2022-11-25 22:46:35,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 47: [2022-11-25 22:46:35,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 6: [2022-11-25 22:46:35,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-25 22:46:35,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 15: [2022-11-25 22:46:35,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 15: [2022-11-25 22:46:35,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-25 22:46:35,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 41: [2022-11-25 22:46:35,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 24: [2022-11-25 22:46:35,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-25 22:46:35,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 41: [2022-11-25 22:46:35,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-25 22:46:35,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 24: [2022-11-25 22:46:35,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 62: [2022-11-25 22:46:35,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-25 22:46:35,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-25 22:46:35,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 52: [2022-11-25 22:46:35,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-25 22:46:35,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-25 22:46:35,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 31: [2022-11-25 22:46:35,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-25 22:46:35,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-25 22:46:35,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 17: [2022-11-25 22:46:35,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-25 22:46:35,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-25 22:46:35,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 43: [2022-11-25 22:46:35,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 5: [2022-11-25 22:46:35,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 43: [2022-11-25 22:46:35,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 5: [2022-11-25 22:46:35,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 43: [2022-11-25 22:46:35,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 5: [2022-11-25 22:46:35,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 0: [2022-11-25 22:46:35,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 0: [2022-11-25 22:46:35,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-25 22:46:35,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 15: [2022-11-25 22:46:35,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-25 22:46:35,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-25 22:46:35,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 14: [2022-11-25 22:46:35,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 14: [2022-11-25 22:46:35,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-25 22:46:35,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 32: [2022-11-25 22:46:35,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 59: [2022-11-25 22:46:35,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-25 22:46:35,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-25 22:46:35,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 36: [2022-11-25 22:46:35,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 38: [2022-11-25 22:46:35,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 9: [2022-11-25 22:46:35,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 21: [2022-11-25 22:46:35,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 9: [2022-11-25 22:46:35,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 36: [2022-11-25 22:46:35,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 21: [2022-11-25 22:46:35,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 9: [2022-11-25 22:46:35,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 32: [2022-11-25 22:46:35,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 36: [2022-11-25 22:46:35,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 38: [2022-11-25 22:46:35,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 21: [2022-11-25 22:46:35,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 32: [2022-11-25 22:46:35,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 38: [2022-11-25 22:46:35,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 48: [2022-11-25 22:46:35,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-25 22:46:35,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-25 22:46:35,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 53: [2022-11-25 22:46:35,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 2: [2022-11-25 22:46:35,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 53: [2022-11-25 22:46:35,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 2: [2022-11-25 22:46:35,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 53: [2022-11-25 22:46:35,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 2: [2022-11-25 22:46:35,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 4: [2022-11-25 22:46:35,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 60: [2022-11-25 22:46:35,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-25 22:46:35,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-25 22:46:35,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 62: [2022-11-25 22:46:35,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-25 22:46:35,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-25 22:46:35,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 53: [2022-11-25 22:46:35,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-25 22:46:35,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-25 22:46:35,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 50: [2022-11-25 22:46:35,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 50: [2022-11-25 22:46:35,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-25 22:46:35,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-25 22:46:35,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-25 22:46:35,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 50: [2022-11-25 22:46:35,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 4: [2022-11-25 22:46:35,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-25 22:46:35,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 23: [2022-11-25 22:46:35,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 33: [2022-11-25 22:46:35,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 23: [2022-11-25 22:46:35,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 33: [2022-11-25 22:46:35,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 23: [2022-11-25 22:46:35,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 33: [2022-11-25 22:46:35,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 12: [2022-11-25 22:46:35,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-25 22:46:35,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-25 22:46:35,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 47: [2022-11-25 22:46:35,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-25 22:46:35,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 13: [2022-11-25 22:46:35,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 47: [2022-11-25 22:46:35,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 13: [2022-11-25 22:46:35,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-25 22:46:35,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 54: [2022-11-25 22:46:35,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 54: [2022-11-25 22:46:35,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 51: [2022-11-25 22:46:35,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 54: [2022-11-25 22:46:35,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 51: [2022-11-25 22:46:35,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-25 22:46:35,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-25 22:46:35,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 51: [2022-11-25 22:46:35,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-25 22:46:35,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 25: [2022-11-25 22:46:35,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 25: [2022-11-25 22:46:35,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-25 22:46:35,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 44: [2022-11-25 22:46:35,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 44: [2022-11-25 22:46:35,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 44: [2022-11-25 22:46:35,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 24: [2022-11-25 22:46:35,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 58: [2022-11-25 22:46:35,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 24: [2022-11-25 22:46:35,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-25 22:46:35,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 58: [2022-11-25 22:46:35,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-25 22:46:35,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 50: [2022-11-25 22:46:35,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-25 22:46:35,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-25 22:46:35,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 33: [2022-11-25 22:46:35,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-25 22:46:35,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-25 22:46:35,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 48: [2022-11-25 22:46:35,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-25 22:46:35,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-25 22:46:35,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 29: [2022-11-25 22:46:35,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 29: [2022-11-25 22:46:35,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 32: [2022-11-25 22:46:35,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 29: [2022-11-25 22:46:35,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 32: [2022-11-25 22:46:35,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-25 22:46:35,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 30: [2022-11-25 22:46:35,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-25 22:46:35,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-25 22:46:35,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 12: [2022-11-25 22:46:35,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:46:35,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:46:35,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-25 22:46:35,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 60: [2022-11-25 22:46:35,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-25 22:46:35,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 26: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 26: [2022-11-25 22:46:35,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 39: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-25 22:46:35,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 40: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 39: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 40: [2022-11-25 22:46:35,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 34: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:46:35,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 34: [2022-11-25 22:46:35,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-25 22:46:35,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 51: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 51: [2022-11-25 22:46:35,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 45: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-25 22:46:35,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 51: [2022-11-25 22:46:35,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 45: [2022-11-25 22:46:35,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-25 22:46:35,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-25 22:46:35,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-25 22:46:35,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 45: [2022-11-25 22:46:35,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 45: [2022-11-25 22:46:35,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 63: [2022-11-25 22:46:35,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-25 22:46:35,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-25 22:46:35,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-25 22:46:35,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-25 22:46:35,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 63: [2022-11-25 22:46:35,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 31: [2022-11-25 22:46:35,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-25 22:46:35,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-25 22:46:35,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 23: [2022-11-25 22:46:35,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-25 22:46:35,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-25 22:46:35,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 37: [2022-11-25 22:46:35,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-25 22:46:35,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-25 22:46:35,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 12: [2022-11-25 22:46:35,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-25 22:46:35,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 6: [2022-11-25 22:46:35,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 3: [2022-11-25 22:46:35,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 6: [2022-11-25 22:46:35,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 53: [2022-11-25 22:46:35,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-25 22:46:35,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 3: [2022-11-25 22:46:35,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-25 22:46:35,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 6: [2022-11-25 22:46:35,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 19: [2022-11-25 22:46:35,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 5: [2022-11-25 22:46:35,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 19: [2022-11-25 22:46:35,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 5: [2022-11-25 22:46:35,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 19: [2022-11-25 22:46:35,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 5: [2022-11-25 22:46:35,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 29: [2022-11-25 22:46:35,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 58: [2022-11-25 22:46:35,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 29: [2022-11-25 22:46:35,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 41: [2022-11-25 22:46:35,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 58: [2022-11-25 22:46:35,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 29: [2022-11-25 22:46:35,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 41: [2022-11-25 22:46:35,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 53: [2022-11-25 22:46:35,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 58: [2022-11-25 22:46:35,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 41: [2022-11-25 22:46:35,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 54: [2022-11-25 22:46:35,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 0: [2022-11-25 22:46:35,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-25 22:46:35,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 54: [2022-11-25 22:46:35,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-25 22:46:35,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 54: [2022-11-25 22:46:35,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 54: [2022-11-25 22:46:35,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-25 22:46:35,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 17: [2022-11-25 22:46:35,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 17: [2022-11-25 22:46:35,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-25 22:46:35,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 62: [2022-11-25 22:46:35,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-25 22:46:35,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 19: [2022-11-25 22:46:35,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 62: [2022-11-25 22:46:35,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 19: [2022-11-25 22:46:35,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-25 22:46:35,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 1: [2022-11-25 22:46:35,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-25 22:46:35,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 1: [2022-11-25 22:46:35,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-25 22:46:35,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 1: [2022-11-25 22:46:35,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 1: [2022-11-25 22:46:35,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 20: [2022-11-25 22:46:35,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-25 22:46:35,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 46: [2022-11-25 22:46:35,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 20: [2022-11-25 22:46:35,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-25 22:46:35,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-25 22:46:35,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 20: [2022-11-25 22:46:35,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 20: [2022-11-25 22:46:35,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 46: [2022-11-25 22:46:35,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 20: [2022-11-25 22:46:35,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 20: [2022-11-25 22:46:35,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 46: [2022-11-25 22:46:35,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 3: [2022-11-25 22:46:35,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-25 22:46:35,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-25 22:46:35,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 55: [2022-11-25 22:46:35,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 55: [2022-11-25 22:46:35,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 55: [2022-11-25 22:46:35,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 3: [2022-11-25 22:46:35,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 3: [2022-11-25 22:46:35,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-25 22:46:35,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 1: [2022-11-25 22:46:35,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 1: [2022-11-25 22:46:35,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-25 22:46:35,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 30: [2022-11-25 22:46:35,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-25 22:46:35,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 30: [2022-11-25 22:46:35,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 35: [2022-11-25 22:46:35,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-25 22:46:35,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-25 22:46:35,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 11: [2022-11-25 22:46:35,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 11: [2022-11-25 22:46:35,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 11: [2022-11-25 22:46:35,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 23: [2022-11-25 22:46:35,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-25 22:46:35,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-25 22:46:35,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 2: [2022-11-25 22:46:35,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-25 22:46:35,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-25 22:46:35,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 27: [2022-11-25 22:46:35,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 27: [2022-11-25 22:46:35,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-25 22:46:35,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-25 22:46:35,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-25 22:46:35,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 27: [2022-11-25 22:46:35,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-25 22:46:35,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-25 22:46:35,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 27: [2022-11-25 22:46:35,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 16: [2022-11-25 22:46:35,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:46:35,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-25 22:46:35,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 24: [2022-11-25 22:46:35,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-25 22:46:35,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-25 22:46:35,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 39: [2022-11-25 22:46:35,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-25 22:46:35,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-25 22:46:35,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 19: [2022-11-25 22:46:35,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-25 22:46:35,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-25 22:46:35,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 0: [2022-11-25 22:46:35,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-25 22:46:35,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-25 22:46:35,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 6: [2022-11-25 22:46:35,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 6: [2022-11-25 22:46:35,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-25 22:46:35,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 60: [2022-11-25 22:46:35,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-25 22:46:35,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-25 22:46:35,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 43: [2022-11-25 22:46:35,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-25 22:46:35,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-25 22:46:35,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 55: [2022-11-25 22:46:35,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-25 22:46:35,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-25 22:46:35,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-25 22:46:35,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 36: [2022-11-25 22:46:35,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 55: [2022-11-25 22:46:35,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 55: [2022-11-25 22:46:35,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 36: [2022-11-25 22:46:35,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-25 22:46:35,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 12: [2022-11-25 22:46:35,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-25 22:46:35,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-25 22:46:35,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 8: [2022-11-25 22:46:35,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-25 22:46:35,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-25 22:46:35,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 7: [2022-11-25 22:46:35,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-25 22:46:35,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 7: [2022-11-25 22:46:35,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-25 22:46:35,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-25 22:46:35,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-25 22:46:35,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 7: [2022-11-25 22:46:35,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-25 22:46:35,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 7: [2022-11-25 22:46:35,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 11: [2022-11-25 22:46:35,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 7: [2022-11-25 22:46:35,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 7: [2022-11-25 22:46:35,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 36: [2022-11-25 22:46:35,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 11: [2022-11-25 22:46:35,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 7: [2022-11-25 22:46:35,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 36: [2022-11-25 22:46:35,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 11: [2022-11-25 22:46:35,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 36: [2022-11-25 22:46:35,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 31: [2022-11-25 22:46:35,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 31: [2022-11-25 22:46:35,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 61: [2022-11-25 22:46:35,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 61: [2022-11-25 22:46:35,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-25 22:46:35,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 31: [2022-11-25 22:46:35,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 35: [2022-11-25 22:46:35,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 35: [2022-11-25 22:46:35,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-25 22:46:35,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 22: [2022-11-25 22:46:35,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-25 22:46:35,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-25 22:46:35,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 49: [2022-11-25 22:46:35,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 49: [2022-11-25 22:46:35,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-25 22:46:35,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-25 22:46:35,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-25 22:46:35,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-25 22:46:35,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 49: [2022-11-25 22:46:35,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-25 22:46:35,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-25 22:46:35,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-25 22:46:35,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 49: [2022-11-25 22:46:35,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 49: [2022-11-25 22:46:35,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 18: [2022-11-25 22:46:35,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-25 22:46:35,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-25 22:46:35,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-25 22:46:35,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-25 22:46:35,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 18: [2022-11-25 22:46:35,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 18: [2022-11-25 22:46:35,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-25 22:46:35,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-25 22:46:35,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 56: [2022-11-25 22:46:35,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-25 22:46:35,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-25 22:46:35,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 30: [2022-11-25 22:46:35,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-25 22:46:35,366] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-25 22:46:35,366] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 13: [2022-11-25 22:46:35,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 13: [2022-11-25 22:46:35,366] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-25 22:46:35,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 4: [2022-11-25 22:46:35,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-25 22:46:35,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-25 22:46:35,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 17: [2022-11-25 22:46:35,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-25 22:46:35,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-25 22:46:35,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 18: [2022-11-25 22:46:35,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-25 22:46:35,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-25 22:46:35,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 33: [2022-11-25 22:46:35,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-25 22:46:35,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-25 22:46:35,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 21: [2022-11-25 22:46:35,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-25 22:46:35,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-25 22:46:35,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 27: [2022-11-25 22:46:35,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-25 22:46:35,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-25 22:46:35,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 59: [2022-11-25 22:46:35,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-25 22:46:35,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-25 22:46:35,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 58: [2022-11-25 22:46:35,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-25 22:46:35,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-25 22:46:35,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 25: [2022-11-25 22:46:35,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-25 22:46:35,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-25 22:46:35,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 56: [2022-11-25 22:46:35,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 56: [2022-11-25 22:46:35,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 56: [2022-11-25 22:46:35,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 40: [2022-11-25 22:46:35,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-25 22:46:35,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-25 22:46:35,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 22: [2022-11-25 22:46:35,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-25 22:46:35,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-25 22:46:35,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 30: [2022-11-25 22:46:35,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 11: [2022-11-25 22:46:35,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 30: [2022-11-25 22:46:35,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 11: [2022-11-25 22:46:35,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-25 22:46:35,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 30: [2022-11-25 22:46:35,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 52: [2022-11-25 22:46:35,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 52: [2022-11-25 22:46:35,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-25 22:46:35,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 61: [2022-11-25 22:46:35,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-25 22:46:35,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-25 22:46:35,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 8: [2022-11-25 22:46:35,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-25 22:46:35,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 8: [2022-11-25 22:46:35,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 61: [2022-11-25 22:46:35,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-25 22:46:35,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-25 22:46:35,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 38: [2022-11-25 22:46:35,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-25 22:46:35,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-25 22:46:35,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 42: [2022-11-25 22:46:35,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 42: [2022-11-25 22:46:35,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-25 22:46:35,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 15: [2022-11-25 22:46:35,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 15: [2022-11-25 22:46:35,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-25 22:46:35,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 50: [2022-11-25 22:46:35,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-25 22:46:35,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-25 22:46:35,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 45: [2022-11-25 22:46:35,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-25 22:46:35,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-25 22:46:35,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 16: [2022-11-25 22:46:35,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:46:35,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-25 22:46:35,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 57: [2022-11-25 22:46:35,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:46:35,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-25 22:46:35,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 60: [2022-11-25 22:46:35,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-25 22:46:35,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-25 22:46:35,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 39: [2022-11-25 22:46:35,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-25 22:46:35,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-25 22:46:35,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 9: [2022-11-25 22:46:35,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-25 22:46:35,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-25 22:46:35,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 10: [2022-11-25 22:46:35,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-25 22:46:35,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-25 22:46:35,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 14: [2022-11-25 22:46:35,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-25 22:46:35,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-25 22:46:35,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 47: [2022-11-25 22:46:35,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-25 22:46:35,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-25 22:46:35,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 37: [2022-11-25 22:46:35,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-25 22:46:35,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-25 22:46:35,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 46: [2022-11-25 22:46:35,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 28: [2022-11-25 22:46:35,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 46: [2022-11-25 22:46:35,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 46: [2022-11-25 22:46:35,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 28: [2022-11-25 22:46:35,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-25 22:46:35,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 35: [2022-11-25 22:46:35,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 35: [2022-11-25 22:46:35,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-25 22:46:35,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 20: [2022-11-25 22:46:35,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-25 22:46:35,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-25 22:46:35,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 29: [2022-11-25 22:46:35,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-25 22:46:35,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 34: [2022-11-25 22:46:35,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 29: [2022-11-25 22:46:35,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 34: [2022-11-25 22:46:35,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-25 22:46:35,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 48: [2022-11-25 22:46:35,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-25 22:46:35,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-25 22:46:35,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 40: [2022-11-25 22:46:35,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-25 22:46:35,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-25 22:46:35,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 26: [2022-11-25 22:46:35,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-25 22:46:35,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-25 22:46:35,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 54: [2022-11-25 22:46:35,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 54: [2022-11-25 22:46:35,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-25 22:46:35,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 10: [2022-11-25 22:46:35,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-25 22:46:35,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 10: [2022-11-25 22:46:35,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 41: [2022-11-25 22:46:35,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-25 22:46:35,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 41: [2022-11-25 22:46:35,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 63: [2022-11-25 22:46:35,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-25 22:46:35,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-25 22:46:35,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 32: [2022-11-25 22:46:35,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-25 22:46:35,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-25 22:46:35,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 17: [2022-11-25 22:46:35,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-25 22:46:35,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-25 22:46:35,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 5: [2022-11-25 22:46:35,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-25 22:46:35,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-25 22:46:35,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 8: [2022-11-25 22:46:35,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-25 22:46:35,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-25 22:46:35,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 7: [2022-11-25 22:46:35,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-25 22:46:35,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-25 22:46:35,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 53: [2022-11-25 22:46:35,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-25 22:46:35,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-25 22:46:35,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 55: [2022-11-25 22:46:35,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-25 22:46:35,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-25 22:46:35,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 30: [2022-11-25 22:46:35,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 28: [2022-11-25 22:46:35,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-25 22:46:35,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-25 22:46:35,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 30: [2022-11-25 22:46:35,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-25 22:46:35,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 31: [2022-11-25 22:46:35,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-25 22:46:35,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-25 22:46:35,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 0: [2022-11-25 22:46:35,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-25 22:46:35,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-25 22:46:35,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 6: [2022-11-25 22:46:35,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 6: [2022-11-25 22:46:35,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 6: [2022-11-25 22:46:35,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 12: [2022-11-25 22:46:35,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-25 22:46:35,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-25 22:46:35,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 62: [2022-11-25 22:46:35,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-25 22:46:35,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-25 22:46:35,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 2: [2022-11-25 22:46:35,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 2: [2022-11-25 22:46:35,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-25 22:46:35,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 1: [2022-11-25 22:46:35,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-25 22:46:35,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-25 22:46:35,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 11: [2022-11-25 22:46:35,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 23: [2022-11-25 22:46:35,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 11: [2022-11-25 22:46:35,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 23: [2022-11-25 22:46:35,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 11: [2022-11-25 22:46:35,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 23: [2022-11-25 22:46:35,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 49: [2022-11-25 22:46:35,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-25 22:46:35,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 22: [2022-11-25 22:46:35,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 49: [2022-11-25 22:46:35,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 22: [2022-11-25 22:46:35,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 22: [2022-11-25 22:46:35,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 19: [2022-11-25 22:46:35,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 19: [2022-11-25 22:46:35,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-25 22:46:35,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 24: [2022-11-25 22:46:35,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-25 22:46:35,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-25 22:46:35,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 4: [2022-11-25 22:46:35,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 36: [2022-11-25 22:46:35,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-25 22:46:35,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-25 22:46:35,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 57: [2022-11-25 22:46:35,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:46:35,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-25 22:46:35,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 4: [2022-11-25 22:46:35,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-25 22:46:35,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 8: [2022-11-25 22:46:35,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 8: [2022-11-25 22:46:35,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-25 22:46:35,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 3: [2022-11-25 22:46:35,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-25 22:46:35,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 3: [2022-11-25 22:46:35,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 43: [2022-11-25 22:46:35,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-25 22:46:35,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-25 22:46:35,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 13: [2022-11-25 22:46:35,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 13: [2022-11-25 22:46:35,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-25 22:46:35,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 21: [2022-11-25 22:46:35,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 21: [2022-11-25 22:46:35,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-25 22:46:35,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 61: [2022-11-25 22:46:35,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-25 22:46:35,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-25 22:46:35,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 42: [2022-11-25 22:46:35,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 42: [2022-11-25 22:46:35,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-25 22:46:35,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 38: [2022-11-25 22:46:35,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-25 22:46:35,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-25 22:46:35,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 18: [2022-11-25 22:46:35,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-25 22:46:35,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-25 22:46:35,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 52: [2022-11-25 22:46:35,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-25 22:46:35,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-25 22:46:35,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 51: [2022-11-25 22:46:35,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-25 22:46:35,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-25 22:46:35,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 27: [2022-11-25 22:46:35,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 27: [2022-11-25 22:46:35,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-25 22:46:35,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 9: [2022-11-25 22:46:35,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 28: [2022-11-25 22:46:35,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 9: [2022-11-25 22:46:35,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 28: [2022-11-25 22:46:35,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 9: [2022-11-25 22:46:35,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 28: [2022-11-25 22:46:35,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 58: [2022-11-25 22:46:35,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-25 22:46:35,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 25: [2022-11-25 22:46:35,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 58: [2022-11-25 22:46:35,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 25: [2022-11-25 22:46:35,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 57: [2022-11-25 22:46:35,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 25: [2022-11-25 22:46:35,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 37: [2022-11-25 22:46:35,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:46:35,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 37: [2022-11-25 22:46:35,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-25 22:46:35,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 57: [2022-11-25 22:46:35,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 50: [2022-11-25 22:46:35,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 50: [2022-11-25 22:46:35,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-25 22:46:35,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 16: [2022-11-25 22:46:35,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:46:35,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 16: [2022-11-25 22:46:35,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 59: [2022-11-25 22:46:35,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 45: [2022-11-25 22:46:35,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-25 22:46:35,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 59: [2022-11-25 22:46:35,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 45: [2022-11-25 22:46:35,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 59: [2022-11-25 22:46:35,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 15: [2022-11-25 22:46:35,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-25 22:46:35,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-25 22:46:35,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 60: [2022-11-25 22:46:35,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-25 22:46:35,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-25 22:46:35,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 39: [2022-11-25 22:46:35,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-25 22:46:35,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-25 22:46:35,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 33: [2022-11-25 22:46:35,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-25 22:46:35,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-25 22:46:35,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 54: [2022-11-25 22:46:35,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 20: [2022-11-25 22:46:35,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-25 22:46:35,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-25 22:46:35,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 34: [2022-11-25 22:46:35,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:46:35,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-25 22:46:35,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 56: [2022-11-25 22:46:35,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-25 22:46:35,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-25 22:46:35,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 54: [2022-11-25 22:46:35,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-25 22:46:35,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 35: [2022-11-25 22:46:35,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-25 22:46:35,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-25 22:46:35,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 47: [2022-11-25 22:46:35,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-25 22:46:35,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-25 22:46:35,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 29: [2022-11-25 22:46:35,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-25 22:46:35,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-25 22:46:35,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 55: [2022-11-25 22:46:35,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-25 22:46:35,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-25 22:46:35,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 40: [2022-11-25 22:46:35,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-25 22:46:35,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-25 22:46:35,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 26: [2022-11-25 22:46:35,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 33: [2022-11-25 22:46:35,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 26: [2022-11-25 22:46:35,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 26: [2022-11-25 22:46:35,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 33: [2022-11-25 22:46:35,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 22: [2022-11-25 22:46:35,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 33: [2022-11-25 22:46:35,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 22: [2022-11-25 22:46:35,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-25 22:46:35,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 14: [2022-11-25 22:46:35,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-25 22:46:35,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-25 22:46:35,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 57: [2022-11-25 22:46:35,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:46:35,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 46: [2022-11-25 22:46:35,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:46:35,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 46: [2022-11-25 22:46:35,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-25 22:46:35,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 56: [2022-11-25 22:46:35,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-25 22:46:35,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 56: [2022-11-25 22:46:35,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 53: [2022-11-25 22:46:35,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-25 22:46:35,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-25 22:46:35,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 63: [2022-11-25 22:46:35,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 48: [2022-11-25 22:46:35,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 63: [2022-11-25 22:46:35,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 48: [2022-11-25 22:46:35,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 63: [2022-11-25 22:46:35,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 48: [2022-11-25 22:46:35,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 46: [2022-11-25 22:46:35,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-25 22:46:35,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-25 22:46:35,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 32: [2022-11-25 22:46:35,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 32: [2022-11-25 22:46:35,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-25 22:46:35,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 5: [2022-11-25 22:46:35,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 41: [2022-11-25 22:46:35,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 5: [2022-11-25 22:46:35,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 41: [2022-11-25 22:46:35,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 5: [2022-11-25 22:46:35,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 41: [2022-11-25 22:46:35,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 1: [2022-11-25 22:46:35,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-25 22:46:35,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 1: [2022-11-25 22:46:35,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 28: [2022-11-25 22:46:35,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-25 22:46:35,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-25 22:46:35,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 31: [2022-11-25 22:46:35,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 31: [2022-11-25 22:46:35,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-25 22:46:35,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 10: [2022-11-25 22:46:35,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-25 22:46:35,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-25 22:46:35,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 19: [2022-11-25 22:46:35,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-25 22:46:35,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-25 22:46:35,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 44: [2022-11-25 22:46:35,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-25 22:46:35,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-25 22:46:35,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 17: [2022-11-25 22:46:35,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-25 22:46:35,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-25 22:46:35,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 10: [2022-11-25 22:46:35,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 10: [2022-11-25 22:46:35,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 10: [2022-11-25 22:46:35,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 62: [2022-11-25 22:46:35,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-25 22:46:35,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-25 22:46:35,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 2: [2022-11-25 22:46:35,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-25 22:46:35,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-25 22:46:35,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 30: [2022-11-25 22:46:35,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-25 22:46:35,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-25 22:46:35,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 36: [2022-11-25 22:46:35,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 36: [2022-11-25 22:46:35,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-25 22:46:35,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 7: [2022-11-25 22:46:35,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-25 22:46:35,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 7: [2022-11-25 22:46:35,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 8: [2022-11-25 22:46:35,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-25 22:46:35,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-25 22:46:35,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 23: [2022-11-25 22:46:35,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 23: [2022-11-25 22:46:35,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-25 22:46:35,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 12: [2022-11-25 22:46:35,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-25 22:46:35,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-25 22:46:35,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 3: [2022-11-25 22:46:35,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 3: [2022-11-25 22:46:35,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 3: [2022-11-25 22:46:35,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 6: [2022-11-25 22:46:35,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-25 22:46:35,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-25 22:46:35,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 0: [2022-11-25 22:46:35,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-25 22:46:35,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-25 22:46:35,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 13: [2022-11-25 22:46:35,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-25 22:46:35,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-25 22:46:35,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 50: [2022-11-25 22:46:35,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 50: [2022-11-25 22:46:35,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-25 22:46:35,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 61: [2022-11-25 22:46:35,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-25 22:46:35,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-25 22:46:35,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 25: [2022-11-25 22:46:35,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 25: [2022-11-25 22:46:35,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-25 22:46:35,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 52: [2022-11-25 22:46:35,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-25 22:46:35,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-25 22:46:35,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 4: [2022-11-25 22:46:35,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 4: [2022-11-25 22:46:35,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-25 22:46:35,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 43: [2022-11-25 22:46:35,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-25 22:46:35,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-25 22:46:35,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 9: [2022-11-25 22:46:35,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 9: [2022-11-25 22:46:35,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-25 22:46:35,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 24: [2022-11-25 22:46:35,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 24: [2022-11-25 22:46:35,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-25 22:46:35,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 37: [2022-11-25 22:46:35,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 37: [2022-11-25 22:46:35,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-25 22:46:35,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 15: [2022-11-25 22:46:35,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-25 22:46:35,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-25 22:46:35,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 11: [2022-11-25 22:46:35,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 11: [2022-11-25 22:46:35,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 21: [2022-11-25 22:46:35,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 11: [2022-11-25 22:46:35,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 21: [2022-11-25 22:46:35,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-25 22:46:35,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 38: [2022-11-25 22:46:35,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 38: [2022-11-25 22:46:35,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-25 22:46:35,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 59: [2022-11-25 22:46:35,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 33: [2022-11-25 22:46:35,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 59: [2022-11-25 22:46:35,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 33: [2022-11-25 22:46:35,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 59: [2022-11-25 22:46:35,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 33: [2022-11-25 22:46:35,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 18: [2022-11-25 22:46:35,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-25 22:46:35,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-25 22:46:35,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 51: [2022-11-25 22:46:35,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 51: [2022-11-25 22:46:35,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-25 22:46:35,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 60: [2022-11-25 22:46:35,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-25 22:46:35,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-25 22:46:35,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 45: [2022-11-25 22:46:35,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 45: [2022-11-25 22:46:35,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-25 22:46:35,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 58: [2022-11-25 22:46:35,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-25 22:46:35,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-25 22:46:35,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 42: [2022-11-25 22:46:35,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 42: [2022-11-25 22:46:35,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-25 22:46:35,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 39: [2022-11-25 22:46:35,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-25 22:46:35,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-25 22:46:35,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 14: [2022-11-25 22:46:35,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 14: [2022-11-25 22:46:35,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-25 22:46:35,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 57: [2022-11-25 22:46:35,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:46:35,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-25 22:46:35,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 47: [2022-11-25 22:46:35,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-25 22:46:35,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-25 22:46:35,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 49: [2022-11-25 22:46:35,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-25 22:46:35,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-25 22:46:35,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 55: [2022-11-25 22:46:35,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 55: [2022-11-25 22:46:35,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-25 22:46:35,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 29: [2022-11-25 22:46:35,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-25 22:46:35,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-25 22:46:35,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 63: [2022-11-25 22:46:35,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 63: [2022-11-25 22:46:35,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-25 22:46:35,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 34: [2022-11-25 22:46:35,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:46:35,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-25 22:46:35,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 53: [2022-11-25 22:46:35,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 56: [2022-11-25 22:46:35,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 53: [2022-11-25 22:46:35,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-25 22:46:35,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 56: [2022-11-25 22:46:35,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-25 22:46:35,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 32: [2022-11-25 22:46:35,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 32: [2022-11-25 22:46:35,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-25 22:46:35,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 35: [2022-11-25 22:46:35,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-25 22:46:35,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-25 22:46:35,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 22: [2022-11-25 22:46:35,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-25 22:46:35,547] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-25 22:46:35,547] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 26: [2022-11-25 22:46:35,547] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-25 22:46:35,547] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-25 22:46:35,547] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 28: [2022-11-25 22:46:35,547] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 28: [2022-11-25 22:46:35,548] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-25 22:46:35,548] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 20: [2022-11-25 22:46:35,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-25 22:46:35,549] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 40: [2022-11-25 22:46:35,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 20: [2022-11-25 22:46:35,549] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 40: [2022-11-25 22:46:35,549] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 40: [2022-11-25 22:46:35,549] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 27: [2022-11-25 22:46:35,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 27: [2022-11-25 22:46:35,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-25 22:46:35,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 31: [2022-11-25 22:46:35,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-25 22:46:35,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-25 22:46:35,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 17: [2022-11-25 22:46:35,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 17: [2022-11-25 22:46:35,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 17: [2022-11-25 22:46:35,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 30: [2022-11-25 22:46:35,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-25 22:46:35,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-25 22:46:35,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 1: [2022-11-25 22:46:35,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-25 22:46:35,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-25 22:46:35,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 5: [2022-11-25 22:46:35,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-25 22:46:35,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-25 22:46:35,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 62: [2022-11-25 22:46:35,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 41: [2022-11-25 22:46:35,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-25 22:46:35,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 62: [2022-11-25 22:46:35,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 41: [2022-11-25 22:46:35,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 62: [2022-11-25 22:46:35,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 23: [2022-11-25 22:46:35,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 48: [2022-11-25 22:46:35,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-25 22:46:35,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 23: [2022-11-25 22:46:35,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 48: [2022-11-25 22:46:35,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 23: [2022-11-25 22:46:35,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 51: [2022-11-25 22:46:35,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-25 22:46:35,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-25 22:46:35,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 16: [2022-11-25 22:46:35,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:46:35,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-25 22:46:35,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 19: [2022-11-25 22:46:35,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-25 22:46:35,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-25 22:46:35,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 36: [2022-11-25 22:46:35,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 11: [2022-11-25 22:46:35,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 11: [2022-11-25 22:46:35,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 11: [2022-11-25 22:46:35,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 0: [2022-11-25 22:46:35,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 7: [2022-11-25 22:46:35,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 0: [2022-11-25 22:46:35,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 7: [2022-11-25 22:46:35,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-25 22:46:35,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 0: [2022-11-25 22:46:35,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 49: [2022-11-25 22:46:35,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-25 22:46:35,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-25 22:46:35,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 36: [2022-11-25 22:46:35,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 36: [2022-11-25 22:46:35,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 3: [2022-11-25 22:46:35,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-25 22:46:35,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 3: [2022-11-25 22:46:35,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 6: [2022-11-25 22:46:35,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-25 22:46:35,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 4: [2022-11-25 22:46:35,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 6: [2022-11-25 22:46:35,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 4: [2022-11-25 22:46:35,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-25 22:46:35,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 43: [2022-11-25 22:46:35,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-25 22:46:35,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 43: [2022-11-25 22:46:35,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 2: [2022-11-25 22:46:35,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-25 22:46:35,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-25 22:46:35,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 12: [2022-11-25 22:46:35,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 12: [2022-11-25 22:46:35,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-25 22:46:35,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 13: [2022-11-25 22:46:35,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-25 22:46:35,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-25 22:46:35,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 61: [2022-11-25 22:46:35,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 61: [2022-11-25 22:46:35,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-25 22:46:35,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 10: [2022-11-25 22:46:35,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-25 22:46:35,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-25 22:46:35,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 52: [2022-11-25 22:46:35,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-25 22:46:35,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-25 22:46:35,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 8: [2022-11-25 22:46:35,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 44: [2022-11-25 22:46:35,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 8: [2022-11-25 22:46:35,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 44: [2022-11-25 22:46:35,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 8: [2022-11-25 22:46:35,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 44: [2022-11-25 22:46:35,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 44: [2022-11-25 22:46:35,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-25 22:46:35,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 59: [2022-11-25 22:46:35,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 44: [2022-11-25 22:46:35,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 59: [2022-11-25 22:46:35,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-25 22:46:35,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 25: [2022-11-25 22:46:35,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-25 22:46:35,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-25 22:46:35,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 27: [2022-11-25 22:46:35,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-25 22:46:35,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-25 22:46:35,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 38: [2022-11-25 22:46:35,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 38: [2022-11-25 22:46:35,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-25 22:46:35,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 21: [2022-11-25 22:46:35,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-25 22:46:35,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-25 22:46:35,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 58: [2022-11-25 22:46:35,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 58: [2022-11-25 22:46:35,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-25 22:46:35,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 24: [2022-11-25 22:46:35,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-25 22:46:35,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-25 22:46:35,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 33: [2022-11-25 22:46:35,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-25 22:46:35,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-25 22:46:35,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 15: [2022-11-25 22:46:35,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-25 22:46:35,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-25 22:46:35,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 57: [2022-11-25 22:46:35,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 60: [2022-11-25 22:46:35,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 57: [2022-11-25 22:46:35,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 60: [2022-11-25 22:46:35,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 57: [2022-11-25 22:46:35,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 60: [2022-11-25 22:46:35,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 42: [2022-11-25 22:46:35,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 42: [2022-11-25 22:46:35,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 42: [2022-11-25 22:46:35,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 42: [2022-11-25 22:46:35,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 42: [2022-11-25 22:46:35,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 42: [2022-11-25 22:46:35,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 9: [2022-11-25 22:46:35,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 28: [2022-11-25 22:46:35,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 9: [2022-11-25 22:46:35,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 28: [2022-11-25 22:46:35,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 9: [2022-11-25 22:46:35,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 28: [2022-11-25 22:46:35,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 39: [2022-11-25 22:46:35,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-25 22:46:35,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-25 22:46:35,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 18: [2022-11-25 22:46:35,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 18: [2022-11-25 22:46:35,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-25 22:46:35,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 46: [2022-11-25 22:46:35,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-25 22:46:35,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-25 22:46:35,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 44: [2022-11-25 22:46:35,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-25 22:46:35,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-25 22:46:35,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 50: [2022-11-25 22:46:35,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-25 22:46:35,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 54: [2022-11-25 22:46:35,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 50: [2022-11-25 22:46:35,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 37: [2022-11-25 22:46:35,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 37: [2022-11-25 22:46:35,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-25 22:46:35,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 45: [2022-11-25 22:46:35,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 54: [2022-11-25 22:46:35,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 45: [2022-11-25 22:46:35,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-25 22:46:35,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 54: [2022-11-25 22:46:35,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 14: [2022-11-25 22:46:35,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-25 22:46:35,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-25 22:46:35,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 56: [2022-11-25 22:46:35,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-25 22:46:35,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-25 22:46:35,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 20: [2022-11-25 22:46:35,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-25 22:46:35,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-25 22:46:35,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 29: [2022-11-25 22:46:35,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-25 22:46:35,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-25 22:46:35,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 55: [2022-11-25 22:46:35,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-25 22:46:35,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-25 22:46:35,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 34: [2022-11-25 22:46:35,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:46:35,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 32: [2022-11-25 22:46:35,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 34: [2022-11-25 22:46:35,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 32: [2022-11-25 22:46:35,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-25 22:46:35,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 40: [2022-11-25 22:46:35,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-25 22:46:35,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-25 22:46:35,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 26: [2022-11-25 22:46:35,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-25 22:46:35,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-25 22:46:35,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 17: [2022-11-25 22:46:35,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-25 22:46:35,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-25 22:46:35,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 5: [2022-11-25 22:46:35,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-25 22:46:35,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-25 22:46:35,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 63: [2022-11-25 22:46:35,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-25 22:46:35,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 53: [2022-11-25 22:46:35,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 41: [2022-11-25 22:46:35,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 53: [2022-11-25 22:46:35,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 41: [2022-11-25 22:46:35,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 53: [2022-11-25 22:46:35,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 41: [2022-11-25 22:46:35,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 31: [2022-11-25 22:46:35,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-25 22:46:35,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-25 22:46:35,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 62: [2022-11-25 22:46:35,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-25 22:46:35,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-25 22:46:35,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 48: [2022-11-25 22:46:35,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-25 22:46:35,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-25 22:46:35,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 63: [2022-11-25 22:46:35,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 19: [2022-11-25 22:46:35,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-25 22:46:35,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-25 22:46:35,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 51: [2022-11-25 22:46:35,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-25 22:46:35,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-25 22:46:35,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 10: [2022-11-25 22:46:35,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 8: [2022-11-25 22:46:35,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 10: [2022-11-25 22:46:35,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-25 22:46:35,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 8: [2022-11-25 22:46:35,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-25 22:46:35,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 22: [2022-11-25 22:46:35,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-25 22:46:35,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-25 22:46:35,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 2: [2022-11-25 22:46:35,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-25 22:46:35,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-25 22:46:35,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 22: [2022-11-25 22:46:35,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-25 22:46:35,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-25 22:46:35,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 4: [2022-11-25 22:46:35,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-25 22:46:35,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 1: [2022-11-25 22:46:35,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 4: [2022-11-25 22:46:35,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 1: [2022-11-25 22:46:35,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-25 22:46:35,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 30: [2022-11-25 22:46:35,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 49: [2022-11-25 22:46:35,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-25 22:46:35,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-25 22:46:35,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 30: [2022-11-25 22:46:35,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-25 22:46:35,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 59: [2022-11-25 22:46:35,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-25 22:46:35,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 53: [2022-11-25 22:46:35,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 59: [2022-11-25 22:46:35,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 53: [2022-11-25 22:46:35,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 23: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 55: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 19: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 23: [2022-11-25 22:46:35,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 55: [2022-11-25 22:46:35,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 23: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 55: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 19: [2022-11-25 22:46:35,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 14: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 0: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 32: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 14: [2022-11-25 22:46:35,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 0: [2022-11-25 22:46:35,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 0: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 31: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 32: [2022-11-25 22:46:35,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 15: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 14: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 32: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 31: [2022-11-25 22:46:35,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 15: [2022-11-25 22:46:35,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 31: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 15: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 58: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 58: [2022-11-25 22:46:35,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 60: [2022-11-25 22:46:35,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 58: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 13: [2022-11-25 22:46:35,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 60: [2022-11-25 22:46:35,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 13: [2022-11-25 22:46:35,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 60: [2022-11-25 22:46:35,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 13: [2022-11-25 22:46:35,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 33: [2022-11-25 22:46:35,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-25 22:46:35,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-25 22:46:35,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 11: [2022-11-25 22:46:35,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 34: [2022-11-25 22:46:35,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 11: [2022-11-25 22:46:35,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-25 22:46:35,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 34: [2022-11-25 22:46:35,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-25 22:46:35,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 29: [2022-11-25 22:46:35,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-25 22:46:35,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 50: [2022-11-25 22:46:35,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-25 22:46:35,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 29: [2022-11-25 22:46:35,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 50: [2022-11-25 22:46:35,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 26: [2022-11-25 22:46:35,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 21: [2022-11-25 22:46:35,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 26: [2022-11-25 22:46:35,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 21: [2022-11-25 22:46:35,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 26: [2022-11-25 22:46:35,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 21: [2022-11-25 22:46:35,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 42: [2022-11-25 22:46:35,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 40: [2022-11-25 22:46:35,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-25 22:46:35,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-25 22:46:35,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 42: [2022-11-25 22:46:35,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-25 22:46:35,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 18: [2022-11-25 22:46:35,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-25 22:46:35,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-25 22:46:35,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 3: [2022-11-25 22:46:35,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 24: [2022-11-25 22:46:35,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 43: [2022-11-25 22:46:35,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 3: [2022-11-25 22:46:35,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 24: [2022-11-25 22:46:35,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 43: [2022-11-25 22:46:35,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 3: [2022-11-25 22:46:35,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 24: [2022-11-25 22:46:35,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 43: [2022-11-25 22:46:35,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 36: [2022-11-25 22:46:35,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 9: [2022-11-25 22:46:35,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 36: [2022-11-25 22:46:35,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-25 22:46:35,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 9: [2022-11-25 22:46:35,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-25 22:46:35,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 52: [2022-11-25 22:46:35,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 28: [2022-11-25 22:46:35,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 61: [2022-11-25 22:46:35,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 52: [2022-11-25 22:46:35,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 61: [2022-11-25 22:46:35,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 52: [2022-11-25 22:46:35,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 25: [2022-11-25 22:46:35,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 28: [2022-11-25 22:46:35,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 61: [2022-11-25 22:46:35,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 28: [2022-11-25 22:46:35,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 25: [2022-11-25 22:46:35,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-25 22:46:35,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 39: [2022-11-25 22:46:35,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 1: [2022-11-25 22:46:35,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 39: [2022-11-25 22:46:35,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-25 22:46:35,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 27: [2022-11-25 22:46:35,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 1: [2022-11-25 22:46:35,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 27: [2022-11-25 22:46:35,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 1: [2022-11-25 22:46:35,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 20: [2022-11-25 22:46:35,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 27: [2022-11-25 22:46:35,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 48: [2022-11-25 22:46:35,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 20: [2022-11-25 22:46:35,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 48: [2022-11-25 22:46:35,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-25 22:46:35,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 56: [2022-11-25 22:46:35,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 20: [2022-11-25 22:46:35,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 56: [2022-11-25 22:46:35,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-25 22:46:35,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 38: [2022-11-25 22:46:35,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 47: [2022-11-25 22:46:35,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 38: [2022-11-25 22:46:35,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-25 22:46:35,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 47: [2022-11-25 22:46:35,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 47: [2022-11-25 22:46:35,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 37: [2022-11-25 22:46:35,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-25 22:46:35,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-25 22:46:35,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 57: [2022-11-25 22:46:35,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 7: [2022-11-25 22:46:35,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 57: [2022-11-25 22:46:35,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 7: [2022-11-25 22:46:35,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-25 22:46:35,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 57: [2022-11-25 22:46:35,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 6: [2022-11-25 22:46:35,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-25 22:46:35,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-25 22:46:35,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 51: [2022-11-25 22:46:35,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 51: [2022-11-25 22:46:35,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-25 22:46:35,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 35: [2022-11-25 22:46:35,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 35: [2022-11-25 22:46:35,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-25 22:46:35,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 45: [2022-11-25 22:46:35,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 45: [2022-11-25 22:46:35,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 45: [2022-11-25 22:46:35,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 3: [2022-11-25 22:46:35,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 3: [2022-11-25 22:46:35,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-25 22:46:35,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 63: [2022-11-25 22:46:35,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-25 22:46:35,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-25 22:46:35,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 63: [2022-11-25 22:46:35,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-25 22:46:35,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-25 22:46:35,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 16: [2022-11-25 22:46:35,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:46:35,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-25 22:46:35,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 44: [2022-11-25 22:46:35,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-25 22:46:35,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-25 22:46:35,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 47: [2022-11-25 22:46:35,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-25 22:46:35,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 47: [2022-11-25 22:46:35,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 23: [2022-11-25 22:46:35,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-25 22:46:35,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-25 22:46:35,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 12: [2022-11-25 22:46:35,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-25 22:46:35,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-25 22:46:35,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 54: [2022-11-25 22:46:35,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 54: [2022-11-25 22:46:35,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-25 22:46:35,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 16: [2022-11-25 22:46:35,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-25 22:46:35,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-25 22:46:35,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 46: [2022-11-25 22:46:35,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 43: [2022-11-25 22:46:35,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 46: [2022-11-25 22:46:35,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-25 22:46:35,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 43: [2022-11-25 22:46:35,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-25 22:46:35,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 46: [2022-11-25 22:46:35,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-25 22:46:35,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-25 22:46:35,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 35: [2022-11-25 22:46:35,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-25 22:46:35,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-25 22:46:35,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 54: [2022-11-25 22:46:35,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 54: [2022-11-25 22:46:35,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step8000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-25 22:46:35,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! 0: successfully saved checkpoint at iteration 8000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5392.58 63: iteration 8010/ 24424 | consumed samples: 4101120 | consumed tokens: 8399093760 | elapsed time per iteration (s): 2.84 | learning rate: 1.579E-04 | global batch size: 512 | lm loss: 2.190056E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.342 | TFLOPs: 18.57 | 63: iteration 8020/ 24424 | consumed samples: 4106240 | consumed tokens: 8409579520 | elapsed time per iteration (s): 2.24 | learning rate: 1.578E-04 | global batch size: 512 | lm loss: 2.198344E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.692 | TFLOPs: 23.54 | 63: iteration 8030/ 24424 | consumed samples: 4111360 | consumed tokens: 8420065280 | elapsed time per iteration (s): 2.23 | learning rate: 1.577E-04 | global batch size: 512 | lm loss: 2.190489E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.995 | TFLOPs: 23.68 | 63: iteration 8040/ 24424 | consumed samples: 4116480 | consumed tokens: 8430551040 | elapsed time per iteration (s): 2.23 | learning rate: 1.577E-04 | global batch size: 512 | lm loss: 2.186124E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.475 | TFLOPs: 23.62 | 63: iteration 8050/ 24424 | consumed samples: 4121600 | consumed tokens: 8441036800 | elapsed time per iteration (s): 2.25 | learning rate: 1.576E-04 | global batch size: 512 | lm loss: 2.177491E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.526 | TFLOPs: 23.42 | 63: iteration 8060/ 24424 | consumed samples: 4126720 | consumed tokens: 8451522560 | elapsed time per iteration (s): 2.27 | learning rate: 1.575E-04 | global batch size: 512 | lm loss: 2.204373E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.334 | TFLOPs: 23.20 | 63: iteration 8070/ 24424 | consumed samples: 4131840 | consumed tokens: 8462008320 | elapsed time per iteration (s): 2.24 | learning rate: 1.574E-04 | global batch size: 512 | lm loss: 2.171112E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.187 | TFLOPs: 23.49 | 63: iteration 8080/ 24424 | consumed samples: 4136960 | consumed tokens: 8472494080 | elapsed time per iteration (s): 2.26 | learning rate: 1.573E-04 | global batch size: 512 | lm loss: 2.191614E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.714 | TFLOPs: 23.34 | 63: iteration 8090/ 24424 | consumed samples: 4142080 | consumed tokens: 8482979840 | elapsed time per iteration (s): 3.94 | learning rate: 1.572E-04 | global batch size: 512 | lm loss: 2.187695E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 129.859 | TFLOPs: 13.37 | 63: iteration 8100/ 24424 | consumed samples: 4147200 | consumed tokens: 8493465600 | elapsed time per iteration (s): 2.24 | learning rate: 1.571E-04 | global batch size: 512 | lm loss: 2.173143E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.406 | TFLOPs: 23.51 | 63: iteration 8110/ 24424 | consumed samples: 4152320 | consumed tokens: 8503951360 | elapsed time per iteration (s): 2.23 | learning rate: 1.570E-04 | global batch size: 512 | lm loss: 2.172344E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.723 | TFLOPs: 23.65 | 63: iteration 8120/ 24424 | consumed samples: 4157440 | consumed tokens: 8514437120 | elapsed time per iteration (s): 2.24 | learning rate: 1.569E-04 | global batch size: 512 | lm loss: 2.173929E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.634 | TFLOPs: 23.54 | 63: iteration 8130/ 24424 | consumed samples: 4162560 | consumed tokens: 8524922880 | elapsed time per iteration (s): 2.23 | learning rate: 1.568E-04 | global batch size: 512 | lm loss: 2.182171E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.347 | TFLOPs: 23.61 | 63: iteration 8140/ 24424 | consumed samples: 4167680 | consumed tokens: 8535408640 | elapsed time per iteration (s): 2.26 | learning rate: 1.567E-04 | global batch size: 512 | lm loss: 2.169821E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.738 | TFLOPs: 23.34 | 63: iteration 8150/ 24424 | consumed samples: 4172800 | consumed tokens: 8545894400 | elapsed time per iteration (s): 2.24 | learning rate: 1.566E-04 | global batch size: 512 | lm loss: 2.180562E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.726 | TFLOPs: 23.55 | 63: iteration 8160/ 24424 | consumed samples: 4177920 | consumed tokens: 8556380160 | elapsed time per iteration (s): 2.23 | learning rate: 1.565E-04 | global batch size: 512 | lm loss: 2.197281E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.546 | TFLOPs: 23.63 | 63: iteration 8170/ 24424 | consumed samples: 4183040 | consumed tokens: 8566865920 | elapsed time per iteration (s): 2.24 | learning rate: 1.564E-04 | global batch size: 512 | lm loss: 2.179203E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.923 | TFLOPs: 23.57 | 63: iteration 8180/ 24424 | consumed samples: 4188160 | consumed tokens: 8577351680 | elapsed time per iteration (s): 2.25 | learning rate: 1.563E-04 | global batch size: 512 | lm loss: 2.184419E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.616 | TFLOPs: 23.43 | 63: iteration 8190/ 24424 | consumed samples: 4193280 | consumed tokens: 8587837440 | elapsed time per iteration (s): 2.23 | learning rate: 1.562E-04 | global batch size: 512 | lm loss: 2.199625E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.609 | TFLOPs: 23.64 | 63: iteration 8200/ 24424 | consumed samples: 4198400 | consumed tokens: 8598323200 | elapsed time per iteration (s): 2.23 | learning rate: 1.561E-04 | global batch size: 512 | lm loss: 2.189950E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.118 | TFLOPs: 23.59 | 63: iteration 8210/ 24424 | consumed samples: 4203520 | consumed tokens: 8608808960 | elapsed time per iteration (s): 2.23 | learning rate: 1.560E-04 | global batch size: 512 | lm loss: 2.174012E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.428 | TFLOPs: 23.62 | 63: iteration 8220/ 24424 | consumed samples: 4208640 | consumed tokens: 8619294720 | elapsed time per iteration (s): 2.25 | learning rate: 1.559E-04 | global batch size: 512 | lm loss: 2.181070E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.256 | TFLOPs: 23.39 | 63: iteration 8230/ 24424 | consumed samples: 4213760 | consumed tokens: 8629780480 | elapsed time per iteration (s): 2.43 | learning rate: 1.558E-04 | global batch size: 512 | lm loss: 2.169124E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 210.957 | TFLOPs: 21.72 | 63: iteration 8240/ 24424 | consumed samples: 4218880 | consumed tokens: 8640266240 | elapsed time per iteration (s): 2.24 | learning rate: 1.557E-04 | global batch size: 512 | lm loss: 2.193015E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.034 | TFLOPs: 23.58 | 63: iteration 8250/ 24424 | consumed samples: 4224000 | consumed tokens: 8650752000 | elapsed time per iteration (s): 3.28 | learning rate: 1.555E-04 | global batch size: 512 | lm loss: 2.180418E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 155.975 | TFLOPs: 16.06 | 63: iteration 8260/ 24424 | consumed samples: 4229120 | consumed tokens: 8661237760 | elapsed time per iteration (s): 2.23 | learning rate: 1.554E-04 | global batch size: 512 | lm loss: 2.182031E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.714 | TFLOPs: 23.65 | 63: iteration 8270/ 24424 | consumed samples: 4234240 | consumed tokens: 8671723520 | elapsed time per iteration (s): 2.23 | learning rate: 1.553E-04 | global batch size: 512 | lm loss: 2.179286E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.121 | TFLOPs: 23.59 | 63: iteration 8280/ 24424 | consumed samples: 4239360 | consumed tokens: 8682209280 | elapsed time per iteration (s): 2.23 | learning rate: 1.552E-04 | global batch size: 512 | lm loss: 2.174596E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.703 | TFLOPs: 23.65 | 63: iteration 8290/ 24424 | consumed samples: 4244480 | consumed tokens: 8692695040 | elapsed time per iteration (s): 2.23 | learning rate: 1.551E-04 | global batch size: 512 | lm loss: 2.177388E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.589 | TFLOPs: 23.64 | 63: iteration 8300/ 24424 | consumed samples: 4249600 | consumed tokens: 8703180800 | elapsed time per iteration (s): 2.24 | learning rate: 1.550E-04 | global batch size: 512 | lm loss: 2.171349E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.959 | TFLOPs: 23.57 | 63: iteration 8310/ 24424 | consumed samples: 4254720 | consumed tokens: 8713666560 | elapsed time per iteration (s): 2.26 | learning rate: 1.549E-04 | global batch size: 512 | lm loss: 2.190063E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.753 | TFLOPs: 23.34 | 63: iteration 8320/ 24424 | consumed samples: 4259840 | consumed tokens: 8724152320 | elapsed time per iteration (s): 2.23 | learning rate: 1.548E-04 | global batch size: 512 | lm loss: 2.179838E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.189 | TFLOPs: 23.59 | 63: iteration 8330/ 24424 | consumed samples: 4264960 | consumed tokens: 8734638080 | elapsed time per iteration (s): 2.25 | learning rate: 1.547E-04 | global batch size: 512 | lm loss: 5.805828E+00 | grad norm: 11.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.186 | TFLOPs: 23.39 | 63: iteration 8340/ 24424 | consumed samples: 4270080 | consumed tokens: 8745123840 | elapsed time per iteration (s): 2.25 | learning rate: 1.546E-04 | global batch size: 512 | lm loss: 8.339335E+00 | grad norm: 1.740 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.045 | TFLOPs: 23.48 | 63: iteration 8350/ 24424 | consumed samples: 4275200 | consumed tokens: 8755609600 | elapsed time per iteration (s): 2.25 | learning rate: 1.545E-04 | global batch size: 512 | lm loss: 7.297879E+00 | grad norm: 2.858 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.353 | TFLOPs: 23.40 | 63: iteration 8360/ 24424 | consumed samples: 4280320 | consumed tokens: 8766095360 | elapsed time per iteration (s): 2.30 | learning rate: 1.544E-04 | global batch size: 512 | lm loss: 6.875623E+00 | grad norm: 1.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.669 | TFLOPs: 22.92 | 63: iteration 8370/ 24424 | consumed samples: 4285440 | consumed tokens: 8776581120 | elapsed time per iteration (s): 2.23 | learning rate: 1.543E-04 | global batch size: 512 | lm loss: 6.559512E+00 | grad norm: 0.817 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.386 | TFLOPs: 23.61 | 63: iteration 8380/ 24424 | consumed samples: 4290560 | consumed tokens: 8787066880 | elapsed time per iteration (s): 5.33 | learning rate: 1.542E-04 | global batch size: 512 | lm loss: 6.344512E+00 | grad norm: 0.975 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 96.121 | TFLOPs: 9.90 | 63: iteration 8390/ 24424 | consumed samples: 4295680 | consumed tokens: 8797552640 | elapsed time per iteration (s): 3.76 | learning rate: 1.541E-04 | global batch size: 512 | lm loss: 6.089774E+00 | grad norm: 0.925 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.171 | TFLOPs: 14.02 | 63: iteration 8400/ 24424 | consumed samples: 4300800 | consumed tokens: 8808038400 | elapsed time per iteration (s): 2.28 | learning rate: 1.540E-04 | global batch size: 512 | lm loss: 5.903983E+00 | grad norm: 0.824 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.885 | TFLOPs: 23.15 | 63: iteration 8410/ 24424 | consumed samples: 4305920 | consumed tokens: 8818524160 | elapsed time per iteration (s): 2.23 | learning rate: 1.539E-04 | global batch size: 512 | lm loss: 5.688502E+00 | grad norm: 1.738 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.287 | TFLOPs: 23.60 | 63: iteration 8420/ 24424 | consumed samples: 4311040 | consumed tokens: 8829009920 | elapsed time per iteration (s): 3.28 | learning rate: 1.538E-04 | global batch size: 512 | lm loss: 5.520607E+00 | grad norm: 0.803 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 155.997 | TFLOPs: 16.06 | 63: iteration 8430/ 24424 | consumed samples: 4316160 | consumed tokens: 8839495680 | elapsed time per iteration (s): 24.61 | learning rate: 1.537E-04 | global batch size: 512 | lm loss: 5.289597E+00 | grad norm: 1.053 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 20.805 | TFLOPs: 2.14 | 63: iteration 8440/ 24424 | consumed samples: 4321280 | consumed tokens: 8849981440 | elapsed time per iteration (s): 2.24 | learning rate: 1.536E-04 | global batch size: 512 | lm loss: 4.949991E+00 | grad norm: 1.975 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.680 | TFLOPs: 23.54 | 63: iteration 8450/ 24424 | consumed samples: 4326400 | consumed tokens: 8860467200 | elapsed time per iteration (s): 2.49 | learning rate: 1.535E-04 | global batch size: 512 | lm loss: 3.917472E+00 | grad norm: 1.972 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 205.581 | TFLOPs: 21.16 | 63: iteration 8460/ 24424 | consumed samples: 4331520 | consumed tokens: 8870952960 | elapsed time per iteration (s): 2.27 | learning rate: 1.534E-04 | global batch size: 512 | lm loss: 3.117004E+00 | grad norm: 0.676 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.123 | TFLOPs: 23.18 | 63: iteration 8470/ 24424 | consumed samples: 4336640 | consumed tokens: 8881438720 | elapsed time per iteration (s): 2.26 | learning rate: 1.533E-04 | global batch size: 512 | lm loss: 2.766894E+00 | grad norm: 1.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.952 | TFLOPs: 23.36 | 63: iteration 8480/ 24424 | consumed samples: 4341760 | consumed tokens: 8891924480 | elapsed time per iteration (s): 2.23 | learning rate: 1.532E-04 | global batch size: 512 | lm loss: 2.601497E+00 | grad norm: 0.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.201 | TFLOPs: 23.60 | 63: iteration 8490/ 24424 | consumed samples: 4346880 | consumed tokens: 8902410240 | elapsed time per iteration (s): 2.24 | learning rate: 1.531E-04 | global batch size: 512 | lm loss: 2.497244E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.208 | TFLOPs: 23.49 | 63: iteration 8500/ 24424 | consumed samples: 4352000 | consumed tokens: 8912896000 | elapsed time per iteration (s): 2.25 | learning rate: 1.530E-04 | global batch size: 512 | lm loss: 2.443851E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.364 | TFLOPs: 23.41 | 63: iteration 8510/ 24424 | consumed samples: 4357120 | consumed tokens: 8923381760 | elapsed time per iteration (s): 2.23 | learning rate: 1.529E-04 | global batch size: 512 | lm loss: 2.354314E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.260 | TFLOPs: 23.60 | 63: iteration 8520/ 24424 | consumed samples: 4362240 | consumed tokens: 8933867520 | elapsed time per iteration (s): 2.26 | learning rate: 1.528E-04 | global batch size: 512 | lm loss: 2.311697E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.832 | TFLOPs: 23.35 | 63: iteration 8530/ 24424 | consumed samples: 4367360 | consumed tokens: 8944353280 | elapsed time per iteration (s): 2.23 | learning rate: 1.527E-04 | global batch size: 512 | lm loss: 2.265644E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.576 | TFLOPs: 23.63 | 63: iteration 8540/ 24424 | consumed samples: 4372480 | consumed tokens: 8954839040 | elapsed time per iteration (s): 2.23 | learning rate: 1.526E-04 | global batch size: 512 | lm loss: 2.261308E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.389 | TFLOPs: 23.61 | 63: iteration 8550/ 24424 | consumed samples: 4377600 | consumed tokens: 8965324800 | elapsed time per iteration (s): 2.23 | learning rate: 1.525E-04 | global batch size: 512 | lm loss: 2.246091E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.474 | TFLOPs: 23.62 | 63: iteration 8560/ 24424 | consumed samples: 4382720 | consumed tokens: 8975810560 | elapsed time per iteration (s): 2.25 | learning rate: 1.524E-04 | global batch size: 512 | lm loss: 2.243014E+00 | grad norm: 0.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.343 | TFLOPs: 23.40 | 63: iteration 8570/ 24424 | consumed samples: 4387840 | consumed tokens: 8986296320 | elapsed time per iteration (s): 2.29 | learning rate: 1.523E-04 | global batch size: 512 | lm loss: 2.213832E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.441 | TFLOPs: 23.00 | 63: iteration 8580/ 24424 | consumed samples: 4392960 | consumed tokens: 8996782080 | elapsed time per iteration (s): 2.30 | learning rate: 1.522E-04 | global batch size: 512 | lm loss: 2.215109E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.146 | TFLOPs: 22.87 | 63: iteration 8590/ 24424 | consumed samples: 4398080 | consumed tokens: 9007267840 | elapsed time per iteration (s): 2.27 | learning rate: 1.521E-04 | global batch size: 512 | lm loss: 2.218213E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.855 | TFLOPs: 23.25 | 63: iteration 8600/ 24424 | consumed samples: 4403200 | consumed tokens: 9017753600 | elapsed time per iteration (s): 2.23 | learning rate: 1.520E-04 | global batch size: 512 | lm loss: 2.200693E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.765 | TFLOPs: 23.65 | 63: iteration 8610/ 24424 | consumed samples: 4408320 | consumed tokens: 9028239360 | elapsed time per iteration (s): 2.24 | learning rate: 1.519E-04 | global batch size: 512 | lm loss: 2.187010E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.179 | TFLOPs: 23.49 | 63: iteration 8620/ 24424 | consumed samples: 4413440 | consumed tokens: 9038725120 | elapsed time per iteration (s): 2.24 | learning rate: 1.518E-04 | global batch size: 512 | lm loss: 2.225822E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.221 | TFLOPs: 23.49 | 63: iteration 8630/ 24424 | consumed samples: 4418560 | consumed tokens: 9049210880 | elapsed time per iteration (s): 2.26 | learning rate: 1.517E-04 | global batch size: 512 | lm loss: 2.196612E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.953 | TFLOPs: 23.36 | 63: iteration 8640/ 24424 | consumed samples: 4423680 | consumed tokens: 9059696640 | elapsed time per iteration (s): 2.28 | learning rate: 1.516E-04 | global batch size: 512 | lm loss: 2.211287E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.650 | TFLOPs: 23.13 | 63: iteration 8650/ 24424 | consumed samples: 4428800 | consumed tokens: 9070182400 | elapsed time per iteration (s): 2.25 | learning rate: 1.515E-04 | global batch size: 512 | lm loss: 2.207305E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.846 | TFLOPs: 23.46 | 63: iteration 8660/ 24424 | consumed samples: 4433920 | consumed tokens: 9080668160 | elapsed time per iteration (s): 2.24 | learning rate: 1.514E-04 | global batch size: 512 | lm loss: 2.182471E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.009 | TFLOPs: 23.58 | 63: iteration 8670/ 24424 | consumed samples: 4439040 | consumed tokens: 9091153920 | elapsed time per iteration (s): 2.24 | learning rate: 1.512E-04 | global batch size: 512 | lm loss: 2.195090E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.247 | TFLOPs: 23.50 | 63: iteration 8680/ 24424 | consumed samples: 4444160 | consumed tokens: 9101639680 | elapsed time per iteration (s): 2.23 | learning rate: 1.511E-04 | global batch size: 512 | lm loss: 2.170349E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.347 | TFLOPs: 23.61 | 63: iteration 8690/ 24424 | consumed samples: 4449280 | consumed tokens: 9112125440 | elapsed time per iteration (s): 2.24 | learning rate: 1.510E-04 | global batch size: 512 | lm loss: 2.186909E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.771 | TFLOPs: 23.55 | 63: iteration 8700/ 24424 | consumed samples: 4454400 | consumed tokens: 9122611200 | elapsed time per iteration (s): 2.26 | learning rate: 1.509E-04 | global batch size: 512 | lm loss: 2.210819E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.834 | TFLOPs: 23.35 | 63: iteration 8710/ 24424 | consumed samples: 4459520 | consumed tokens: 9133096960 | elapsed time per iteration (s): 2.24 | learning rate: 1.508E-04 | global batch size: 512 | lm loss: 2.187985E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.588 | TFLOPs: 23.53 | 63: iteration 8720/ 24424 | consumed samples: 4464640 | consumed tokens: 9143582720 | elapsed time per iteration (s): 2.30 | learning rate: 1.507E-04 | global batch size: 512 | lm loss: 2.189684E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.876 | TFLOPs: 22.94 | 63: iteration 8730/ 24424 | consumed samples: 4469760 | consumed tokens: 9154068480 | elapsed time per iteration (s): 2.25 | learning rate: 1.506E-04 | global batch size: 512 | lm loss: 2.201008E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.039 | TFLOPs: 23.48 | 63: iteration 8740/ 24424 | consumed samples: 4474880 | consumed tokens: 9164554240 | elapsed time per iteration (s): 2.26 | learning rate: 1.505E-04 | global batch size: 512 | lm loss: 2.172401E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.119 | TFLOPs: 23.28 | 63: iteration 8750/ 24424 | consumed samples: 4480000 | consumed tokens: 9175040000 | elapsed time per iteration (s): 2.44 | learning rate: 1.504E-04 | global batch size: 512 | lm loss: 2.186019E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 209.991 | TFLOPs: 21.62 | 63: iteration 8760/ 24424 | consumed samples: 4485120 | consumed tokens: 9185525760 | elapsed time per iteration (s): 3.21 | learning rate: 1.503E-04 | global batch size: 512 | lm loss: 2.176648E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 159.331 | TFLOPs: 16.40 | 63: iteration 8770/ 24424 | consumed samples: 4490240 | consumed tokens: 9196011520 | elapsed time per iteration (s): 2.24 | learning rate: 1.502E-04 | global batch size: 512 | lm loss: 2.215556E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.966 | TFLOPs: 23.57 | 63: iteration 8780/ 24424 | consumed samples: 4495360 | consumed tokens: 9206497280 | elapsed time per iteration (s): 2.25 | learning rate: 1.501E-04 | global batch size: 512 | lm loss: 2.180269E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.902 | TFLOPs: 23.46 | 63: iteration 8790/ 24424 | consumed samples: 4500480 | consumed tokens: 9216983040 | elapsed time per iteration (s): 2.23 | learning rate: 1.500E-04 | global batch size: 512 | lm loss: 2.181543E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.788 | TFLOPs: 23.66 | 63: iteration 8800/ 24424 | consumed samples: 4505600 | consumed tokens: 9227468800 | elapsed time per iteration (s): 2.25 | learning rate: 1.499E-04 | global batch size: 512 | lm loss: 2.181734E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.931 | TFLOPs: 23.46 | 63: iteration 8810/ 24424 | consumed samples: 4510720 | consumed tokens: 9237954560 | elapsed time per iteration (s): 2.23 | learning rate: 1.498E-04 | global batch size: 512 | lm loss: 2.189734E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.908 | TFLOPs: 23.67 | 63: iteration 8820/ 24424 | consumed samples: 4515840 | consumed tokens: 9248440320 | elapsed time per iteration (s): 2.25 | learning rate: 1.497E-04 | global batch size: 512 | lm loss: 2.164854E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.762 | TFLOPs: 23.45 | 63: iteration 8830/ 24424 | consumed samples: 4520960 | consumed tokens: 9258926080 | elapsed time per iteration (s): 2.24 | learning rate: 1.496E-04 | global batch size: 512 | lm loss: 2.189432E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.722 | TFLOPs: 23.55 | 63: iteration 8840/ 24424 | consumed samples: 4526080 | consumed tokens: 9269411840 | elapsed time per iteration (s): 2.26 | learning rate: 1.495E-04 | global batch size: 512 | lm loss: 2.157709E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.498 | TFLOPs: 23.32 | 63: iteration 8850/ 24424 | consumed samples: 4531200 | consumed tokens: 9279897600 | elapsed time per iteration (s): 2.26 | learning rate: 1.494E-04 | global batch size: 512 | lm loss: 2.194006E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.005 | TFLOPs: 23.37 | 63: iteration 8860/ 24424 | consumed samples: 4536320 | consumed tokens: 9290383360 | elapsed time per iteration (s): 2.27 | learning rate: 1.493E-04 | global batch size: 512 | lm loss: 2.176324E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.816 | TFLOPs: 23.25 | 63: iteration 8870/ 24424 | consumed samples: 4541440 | consumed tokens: 9300869120 | elapsed time per iteration (s): 2.27 | learning rate: 1.492E-04 | global batch size: 512 | lm loss: 2.158150E+00 | grad norm: 0.664 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.118 | TFLOPs: 23.17 | 63: iteration 8880/ 24424 | consumed samples: 4546560 | consumed tokens: 9311354880 | elapsed time per iteration (s): 3.22 | learning rate: 1.491E-04 | global batch size: 512 | lm loss: 2.222600E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 158.831 | TFLOPs: 16.35 | 63: iteration 8890/ 24424 | consumed samples: 4551680 | consumed tokens: 9321840640 | elapsed time per iteration (s): 3.12 | learning rate: 1.489E-04 | global batch size: 512 | lm loss: 2.205909E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 163.971 | TFLOPs: 16.88 | 63: iteration 8900/ 24424 | consumed samples: 4556800 | consumed tokens: 9332326400 | elapsed time per iteration (s): 2.24 | learning rate: 1.488E-04 | global batch size: 512 | lm loss: 2.179106E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.224 | TFLOPs: 23.49 | 63: iteration 8910/ 24424 | consumed samples: 4561920 | consumed tokens: 9342812160 | elapsed time per iteration (s): 2.30 | learning rate: 1.487E-04 | global batch size: 512 | lm loss: 2.185483E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.457 | TFLOPs: 22.90 | 63: iteration 8920/ 24424 | consumed samples: 4567040 | consumed tokens: 9353297920 | elapsed time per iteration (s): 2.23 | learning rate: 1.486E-04 | global batch size: 512 | lm loss: 2.182710E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.318 | TFLOPs: 23.61 | 63: iteration 8930/ 24424 | consumed samples: 4572160 | consumed tokens: 9363783680 | elapsed time per iteration (s): 2.27 | learning rate: 1.485E-04 | global batch size: 512 | lm loss: 2.168816E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.672 | TFLOPs: 23.23 | 63: iteration 8940/ 24424 | consumed samples: 4577280 | consumed tokens: 9374269440 | elapsed time per iteration (s): 2.24 | learning rate: 1.484E-04 | global batch size: 512 | lm loss: 2.185774E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.161 | TFLOPs: 23.49 | 63: iteration 8950/ 24424 | consumed samples: 4582400 | consumed tokens: 9384755200 | elapsed time per iteration (s): 2.24 | learning rate: 1.483E-04 | global batch size: 512 | lm loss: 2.185757E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.732 | TFLOPs: 23.55 | 63: iteration 8960/ 24424 | consumed samples: 4587520 | consumed tokens: 9395240960 | elapsed time per iteration (s): 2.24 | learning rate: 1.482E-04 | global batch size: 512 | lm loss: 2.149207E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.914 | TFLOPs: 23.57 | 63: iteration 8970/ 24424 | consumed samples: 4592640 | consumed tokens: 9405726720 | elapsed time per iteration (s): 2.25 | learning rate: 1.481E-04 | global batch size: 512 | lm loss: 2.182152E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.556 | TFLOPs: 23.43 | 63: iteration 8980/ 24424 | consumed samples: 4597760 | consumed tokens: 9416212480 | elapsed time per iteration (s): 2.25 | learning rate: 1.480E-04 | global batch size: 512 | lm loss: 2.172028E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.519 | TFLOPs: 23.42 | 63: iteration 8990/ 24424 | consumed samples: 4602880 | consumed tokens: 9426698240 | elapsed time per iteration (s): 2.24 | learning rate: 1.479E-04 | global batch size: 512 | lm loss: 2.177617E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.457 | TFLOPs: 23.52 | 63: iteration 9000/ 24424 | consumed samples: 4608000 | consumed tokens: 9437184000 | elapsed time per iteration (s): 2.25 | learning rate: 1.478E-04 | global batch size: 512 | lm loss: 2.173782E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.514 | TFLOPs: 23.42 | 63: ------------------------------------------------------------------------------------------ 63: valid loss at iteration 9000 | lm loss value: 2.105352E+00 | lm loss PPL: 8.209990E+00 | 63: ------------------------------------------------------------------------------------------ 0: saving checkpoint at iteration 9000 to checkpoints_3b9 0: [2022-11-25 23:29:45,248] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step9000 is begin to save! 0: [2022-11-25 23:29:45,278] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_01-model_00-model_states.pt... 32: [2022-11-25 23:29:45,313] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_21-model_00-model_states.pt... 0: [2022-11-25 23:29:45,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_01-model_00-model_states.pt. 0: [2022-11-25 23:29:45,637] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_03-model_00-model_states.pt... 32: [2022-11-25 23:29:45,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_21-model_00-model_states.pt. 32: [2022-11-25 23:29:45,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_22-model_00-model_states.pt... 0: [2022-11-25 23:29:45,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_03-model_00-model_states.pt. 0: [2022-11-25 23:29:45,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_04-model_00-model_states.pt... 32: [2022-11-25 23:29:45,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_22-model_00-model_states.pt. 32: [2022-11-25 23:29:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_23-model_00-model_states.pt... 0: [2022-11-25 23:29:46,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_04-model_00-model_states.pt. 0: [2022-11-25 23:29:46,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_05-model_00-model_states.pt... 32: [2022-11-25 23:29:46,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_23-model_00-model_states.pt. 32: [2022-11-25 23:29:46,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_24-model_00-model_states.pt... 0: [2022-11-25 23:29:46,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_05-model_00-model_states.pt. 0: [2022-11-25 23:29:46,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_06-model_00-model_states.pt... 32: [2022-11-25 23:29:46,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_24-model_00-model_states.pt. 32: [2022-11-25 23:29:46,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_25-model_00-model_states.pt... 0: [2022-11-25 23:29:46,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_06-model_00-model_states.pt. 0: [2022-11-25 23:29:46,601] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_07-model_00-model_states.pt... 32: [2022-11-25 23:29:46,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_25-model_00-model_states.pt. 32: [2022-11-25 23:29:46,681] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_26-model_00-model_states.pt... 0: [2022-11-25 23:29:46,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_07-model_00-model_states.pt. 0: [2022-11-25 23:29:46,842] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_08-model_00-model_states.pt... 32: [2022-11-25 23:29:46,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_26-model_00-model_states.pt. 32: [2022-11-25 23:29:46,918] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_27-model_00-model_states.pt... 0: [2022-11-25 23:29:47,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_08-model_00-model_states.pt. 0: [2022-11-25 23:29:47,065] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_09-model_00-model_states.pt... 32: [2022-11-25 23:29:47,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_27-model_00-model_states.pt. 32: [2022-11-25 23:29:47,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_28-model_00-model_states.pt... 0: [2022-11-25 23:29:47,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_09-model_00-model_states.pt. 0: [2022-11-25 23:29:47,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_10-model_00-model_states.pt... 32: [2022-11-25 23:29:47,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_28-model_00-model_states.pt. 32: [2022-11-25 23:29:47,384] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_29-model_00-model_states.pt... 0: [2022-11-25 23:29:47,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_10-model_00-model_states.pt. 0: [2022-11-25 23:29:47,512] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_11-model_00-model_states.pt... 32: [2022-11-25 23:29:47,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_29-model_00-model_states.pt. 32: [2022-11-25 23:29:47,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_30-model_00-model_states.pt... 0: [2022-11-25 23:29:47,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_11-model_00-model_states.pt. 0: [2022-11-25 23:29:47,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_12-model_00-model_states.pt... 32: [2022-11-25 23:29:47,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_30-model_00-model_states.pt. 32: [2022-11-25 23:29:47,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_31-model_00-model_states.pt... 0: [2022-11-25 23:29:47,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_12-model_00-model_states.pt. 0: [2022-11-25 23:29:47,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_13-model_00-model_states.pt... 32: [2022-11-25 23:29:48,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_31-model_00-model_states.pt. 32: [2022-11-25 23:29:48,089] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_32-model_00-model_states.pt... 0: [2022-11-25 23:29:48,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_13-model_00-model_states.pt. 0: [2022-11-25 23:29:48,167] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_14-model_00-model_states.pt... 32: [2022-11-25 23:29:48,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_32-model_00-model_states.pt. 32: [2022-11-25 23:29:48,314] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_33-model_00-model_states.pt... 0: [2022-11-25 23:29:48,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_14-model_00-model_states.pt. 0: [2022-11-25 23:29:48,387] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_15-model_00-model_states.pt... 32: [2022-11-25 23:29:48,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_33-model_00-model_states.pt. 32: [2022-11-25 23:29:48,553] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_34-model_00-model_states.pt... 0: [2022-11-25 23:29:48,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_15-model_00-model_states.pt. 0: [2022-11-25 23:29:48,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_16-model_00-model_states.pt... 32: [2022-11-25 23:29:48,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_34-model_00-model_states.pt. 32: [2022-11-25 23:29:48,779] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_35-model_00-model_states.pt... 0: [2022-11-25 23:29:48,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_16-model_00-model_states.pt. 0: [2022-11-25 23:29:48,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_17-model_00-model_states.pt... 32: [2022-11-25 23:29:49,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_35-model_00-model_states.pt. 32: [2022-11-25 23:29:49,017] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_36-model_00-model_states.pt... 0: [2022-11-25 23:29:49,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_17-model_00-model_states.pt. 0: [2022-11-25 23:29:49,047] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_18-model_00-model_states.pt... 32: [2022-11-25 23:29:49,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_36-model_00-model_states.pt. 32: [2022-11-25 23:29:49,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_37-model_00-model_states.pt... 0: [2022-11-25 23:29:49,260] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_18-model_00-model_states.pt. 0: [2022-11-25 23:29:49,260] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_19-model_00-model_states.pt... 0: [2022-11-25 23:29:49,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_19-model_00-model_states.pt. 0: [2022-11-25 23:29:49,480] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_20-model_00-model_states.pt... 32: [2022-11-25 23:29:49,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_37-model_00-model_states.pt. 32: [2022-11-25 23:29:49,486] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_38-model_00-model_states.pt... 0: [2022-11-25 23:29:49,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_20-model_00-model_states.pt. 0: [2022-11-25 23:29:49,697] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step9000/mp_rank_00_model_states.pt 0: [2022-11-25 23:29:49,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/mp_rank_00_model_states.pt... 0: [2022-11-25 23:29:49,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/mp_rank_00_model_states.pt. 32: [2022-11-25 23:29:49,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_38-model_00-model_states.pt. 32: [2022-11-25 23:29:49,710] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/layer_40-model_00-model_states.pt... 32: [2022-11-25 23:29:49,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/layer_40-model_00-model_states.pt. 32: [2022-11-25 23:29:49,715] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/mp_rank_01_model_states.pt... 32: [2022-11-25 23:29:49,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/mp_rank_01_model_states.pt. 60: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 60: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 60: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 32: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 48: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 48: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 37: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 37: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 50: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 50: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 38: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 38: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 51: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 55: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 55: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 53: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 57: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 57: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 61: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 61: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 59: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 59: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 63: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 63: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 63: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 63: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 52: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 52: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 54: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 54: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 56: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 56: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 58: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 62: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 62: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 62: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 60: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 0: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 33: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 33: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 33: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 35: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 35: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 34: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 34: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 40: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 40: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 40: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 44: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 44: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 44: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 44: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 32: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 42: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 46: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 46: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 46: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 36: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 36: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 48: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 48: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 41: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 41: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 37: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 37: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 37: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 47: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 47: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 47: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 49: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 49: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 43: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 45: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 45: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 45: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 45: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 39: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 39: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 39: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 50: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 50: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 38: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 51: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 55: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 53: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 53: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 57: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 61: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 59: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 63: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 63: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 19: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 19: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 19: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 19: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 19: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 21: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 21: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 21: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 21: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 15: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 15: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 15: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 15: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 15: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 15: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 52: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 52: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 54: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 54: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 56: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 58: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 58: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 58: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 62: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 16: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 16: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 16: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 2: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 2: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 2: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 4: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 4: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 4: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 4: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 4: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 4: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 14: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 14: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 14: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 18: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 18: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 18: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 10: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 10: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 10: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 10: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 10: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 10: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 8: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 8: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 8: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 0: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 0: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 26: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 26: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 11: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 11: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 11: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 11: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 11: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 17: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 17: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 17: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 17: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 17: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 9: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 9: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 9: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 9: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 9: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 9: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 13: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 13: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 13: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 23: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 23: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 23: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 25: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 25: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 25: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 27: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 27: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 27: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 27: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 27: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 29: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 29: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 31: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 5: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 5: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 5: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 33: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 33: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 1: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 1: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 1: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 1: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 7: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 7: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 7: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 7: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 7: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 3: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 3: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 3: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 3: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 3: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 3: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 35: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 22: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 22: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 22: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 22: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 12: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 12: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 12: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 12: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 30: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 30: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 30: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 30: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 30: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 30: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 24: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 24: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 24: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 24: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 24: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 28: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 28: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 20: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 20: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 20: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 20: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 20: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 20: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 34: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 40: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 40: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 44: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 6: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 32: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 32: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 42: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 42: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 42: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 46: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 46: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 36: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 41: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 47: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 47: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 49: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 43: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 43: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 43: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 45: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 39: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 39: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 51: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 55: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 53: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 61: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 19: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 21: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 21: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 21: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 21: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 15: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 56: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 62: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 16: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 2: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 2: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 2: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 4: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 14: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 14: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 14: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 18: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 18: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 18: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 10: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 8: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 8: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 8: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 8: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 0: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 26: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 26: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 11: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 11: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 17: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 17: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 9: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 13: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 13: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 13: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 23: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 25: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 25: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 27: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 29: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 29: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 31: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 31: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 5: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 5: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 1: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 1: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 7: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 7: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 3: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 3: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 35: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 22: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 22: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 22: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 22: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 12: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 12: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 12: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 30: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 24: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 24: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 28: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 28: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 28: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 28: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 34: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 44: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 6: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 32: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 36: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 41: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 49: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 19: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 19: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 15: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 56: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 16: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 16: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 2: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 2: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 4: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 14: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 18: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 10: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 8: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 26: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 26: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 17: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 9: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 23: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 25: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 27: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 27: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 29: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 29: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 29: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 29: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 31: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 31: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 5: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 5: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 1: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 1: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 12: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 30: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 24: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 28: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 20: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 6: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 6: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 32: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 16: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 16: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 14: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 0: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 13: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 23: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 23: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 31: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 6: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 6: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 32: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 0: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 23: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 31: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 6: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 32: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 0: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 31: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 6: [2022-11-25 23:29:49,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step9000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 21: [2022-11-25 23:29:49,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-25 23:29:49,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-25 23:29:49,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 23: [2022-11-25 23:29:49,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 23: [2022-11-25 23:29:49,981] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-25 23:29:49,981] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 14: [2022-11-25 23:29:49,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 14: [2022-11-25 23:29:49,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-25 23:29:49,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 50: [2022-11-25 23:29:49,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 27: [2022-11-25 23:29:49,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 50: [2022-11-25 23:29:49,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 27: [2022-11-25 23:29:49,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 50: [2022-11-25 23:29:49,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 27: [2022-11-25 23:29:49,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 10: [2022-11-25 23:29:49,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-25 23:29:49,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-25 23:29:49,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 43: [2022-11-25 23:29:49,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-25 23:29:49,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-25 23:29:49,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 47: [2022-11-25 23:29:49,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-25 23:29:49,986] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-25 23:29:49,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 36: [2022-11-25 23:29:49,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 36: [2022-11-25 23:29:49,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 9: [2022-11-25 23:29:49,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 34: [2022-11-25 23:29:49,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 36: [2022-11-25 23:29:49,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 60: [2022-11-25 23:29:49,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 60: [2022-11-25 23:29:49,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 60: [2022-11-25 23:29:49,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 9: [2022-11-25 23:29:49,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 34: [2022-11-25 23:29:49,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 9: [2022-11-25 23:29:49,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 34: [2022-11-25 23:29:49,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 37: [2022-11-25 23:29:49,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 37: [2022-11-25 23:29:49,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-25 23:29:49,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 52: [2022-11-25 23:29:49,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-25 23:29:49,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-25 23:29:49,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 27: [2022-11-25 23:29:49,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-25 23:29:49,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-25 23:29:49,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 25: [2022-11-25 23:29:49,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-25 23:29:49,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-25 23:29:49,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 10: [2022-11-25 23:29:49,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-25 23:29:49,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-25 23:29:49,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 60: [2022-11-25 23:29:49,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 60: [2022-11-25 23:29:49,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-25 23:29:49,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 50: [2022-11-25 23:29:49,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-25 23:29:49,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-25 23:29:49,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 0: [2022-11-25 23:29:49,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-25 23:29:49,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-25 23:29:49,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 34: [2022-11-25 23:29:49,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 34: [2022-11-25 23:29:49,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-25 23:29:49,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 47: [2022-11-25 23:29:50,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 43: [2022-11-25 23:29:50,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 47: [2022-11-25 23:29:50,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 43: [2022-11-25 23:29:50,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 47: [2022-11-25 23:29:50,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 43: [2022-11-25 23:29:50,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 23: [2022-11-25 23:29:50,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-25 23:29:50,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-25 23:29:50,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 0: [2022-11-25 23:29:50,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 0: [2022-11-25 23:29:50,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-25 23:29:50,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 37: [2022-11-25 23:29:50,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-25 23:29:50,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-25 23:29:50,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 50: [2022-11-25 23:29:50,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-25 23:29:50,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-25 23:29:50,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 9: [2022-11-25 23:29:50,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 9: [2022-11-25 23:29:50,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-25 23:29:50,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 52: [2022-11-25 23:29:50,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-25 23:29:50,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-25 23:29:50,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 36: [2022-11-25 23:29:50,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-25 23:29:50,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 36: [2022-11-25 23:29:50,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 11: [2022-11-25 23:29:50,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-25 23:29:50,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 36: [2022-11-25 23:29:50,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 36: [2022-11-25 23:29:50,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-25 23:29:50,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 11: [2022-11-25 23:29:50,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 11: [2022-11-25 23:29:50,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-25 23:29:50,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-25 23:29:50,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 11: [2022-11-25 23:29:50,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 11: [2022-11-25 23:29:50,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 11: [2022-11-25 23:29:50,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 32: [2022-11-25 23:29:50,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-25 23:29:50,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-25 23:29:50,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 60: [2022-11-25 23:29:50,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-25 23:29:50,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-25 23:29:50,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 21: [2022-11-25 23:29:50,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-25 23:29:50,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 10: [2022-11-25 23:29:50,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 21: [2022-11-25 23:29:50,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 10: [2022-11-25 23:29:50,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-25 23:29:50,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 62: [2022-11-25 23:29:50,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-25 23:29:50,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 37: [2022-11-25 23:29:50,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 62: [2022-11-25 23:29:50,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-25 23:29:50,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 6: [2022-11-25 23:29:50,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 37: [2022-11-25 23:29:50,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 62: [2022-11-25 23:29:50,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 6: [2022-11-25 23:29:50,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 37: [2022-11-25 23:29:50,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 62: [2022-11-25 23:29:50,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 6: [2022-11-25 23:29:50,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 0: [2022-11-25 23:29:50,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-25 23:29:50,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-25 23:29:50,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 27: [2022-11-25 23:29:50,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-25 23:29:50,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-25 23:29:50,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 25: [2022-11-25 23:29:50,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 43: [2022-11-25 23:29:50,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 25: [2022-11-25 23:29:50,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 25: [2022-11-25 23:29:50,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 43: [2022-11-25 23:29:50,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 25: [2022-11-25 23:29:50,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 25: [2022-11-25 23:29:50,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-25 23:29:50,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 43: [2022-11-25 23:29:50,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 9: [2022-11-25 23:29:50,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 6: [2022-11-25 23:29:50,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-25 23:29:50,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-25 23:29:50,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 9: [2022-11-25 23:29:50,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-25 23:29:50,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 9: [2022-11-25 23:29:50,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 9: [2022-11-25 23:29:50,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-25 23:29:50,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 11: [2022-11-25 23:29:50,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-25 23:29:50,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 11: [2022-11-25 23:29:50,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 47: [2022-11-25 23:29:50,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-25 23:29:50,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-25 23:29:50,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 62: [2022-11-25 23:29:50,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-25 23:29:50,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-25 23:29:50,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 62: [2022-11-25 23:29:50,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-25 23:29:50,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-25 23:29:50,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 52: [2022-11-25 23:29:50,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 52: [2022-11-25 23:29:50,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-25 23:29:50,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 0: [2022-11-25 23:29:50,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 10: [2022-11-25 23:29:50,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-25 23:29:50,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 10: [2022-11-25 23:29:50,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 6: [2022-11-25 23:29:50,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 3: [2022-11-25 23:29:50,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-25 23:29:50,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 3: [2022-11-25 23:29:50,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 6: [2022-11-25 23:29:50,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-25 23:29:50,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 3: [2022-11-25 23:29:50,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-25 23:29:50,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 3: [2022-11-25 23:29:50,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-25 23:29:50,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 3: [2022-11-25 23:29:50,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 3: [2022-11-25 23:29:50,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 13: [2022-11-25 23:29:50,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-25 23:29:50,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-25 23:29:50,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 6: [2022-11-25 23:29:50,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 6: [2022-11-25 23:29:50,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 27: [2022-11-25 23:29:50,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 6: [2022-11-25 23:29:50,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 27: [2022-11-25 23:29:50,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-25 23:29:50,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 22: [2022-11-25 23:29:50,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-25 23:29:50,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-25 23:29:50,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 22: [2022-11-25 23:29:50,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-25 23:29:50,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-25 23:29:50,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 60: [2022-11-25 23:29:50,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-25 23:29:50,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-25 23:29:50,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 46: [2022-11-25 23:29:50,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-25 23:29:50,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-25 23:29:50,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 22: [2022-11-25 23:29:50,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 22: [2022-11-25 23:29:50,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 22: [2022-11-25 23:29:50,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 17: [2022-11-25 23:29:50,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 17: [2022-11-25 23:29:50,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-25 23:29:50,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-25 23:29:50,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-25 23:29:50,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 17: [2022-11-25 23:29:50,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 3: [2022-11-25 23:29:50,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-25 23:29:50,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-25 23:29:50,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 0: [2022-11-25 23:29:50,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-25 23:29:50,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 7: [2022-11-25 23:29:50,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-25 23:29:50,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-25 23:29:50,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 44: [2022-11-25 23:29:50,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-25 23:29:50,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-25 23:29:50,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-25 23:29:50,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-25 23:29:50,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-25 23:29:50,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-25 23:29:50,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 44: [2022-11-25 23:29:50,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 44: [2022-11-25 23:29:50,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 21: [2022-11-25 23:29:50,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 21: [2022-11-25 23:29:50,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-25 23:29:50,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 32: [2022-11-25 23:29:50,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-25 23:29:50,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-25 23:29:50,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-25 23:29:50,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-25 23:29:50,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 32: [2022-11-25 23:29:50,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 63: [2022-11-25 23:29:50,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-25 23:29:50,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-25 23:29:50,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-25 23:29:50,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-25 23:29:50,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-25 23:29:50,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-25 23:29:50,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-25 23:29:50,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-25 23:29:50,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 63: [2022-11-25 23:29:50,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 63: [2022-11-25 23:29:50,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 63: [2022-11-25 23:29:50,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 13: [2022-11-25 23:29:50,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-25 23:29:50,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 13: [2022-11-25 23:29:50,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-25 23:29:50,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-25 23:29:50,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 13: [2022-11-25 23:29:50,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 7: [2022-11-25 23:29:50,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-25 23:29:50,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 7: [2022-11-25 23:29:50,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-25 23:29:50,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-25 23:29:50,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 7: [2022-11-25 23:29:50,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 36: [2022-11-25 23:29:50,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 36: [2022-11-25 23:29:50,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-25 23:29:50,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 23: [2022-11-25 23:29:50,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-25 23:29:50,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-25 23:29:50,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 45: [2022-11-25 23:29:50,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-25 23:29:50,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-25 23:29:50,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-25 23:29:50,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 45: [2022-11-25 23:29:50,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-25 23:29:50,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 5: [2022-11-25 23:29:50,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-25 23:29:50,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-25 23:29:50,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 5: [2022-11-25 23:29:50,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 5: [2022-11-25 23:29:50,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-25 23:29:50,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-25 23:29:50,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 5: [2022-11-25 23:29:50,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-25 23:29:50,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 17: [2022-11-25 23:29:50,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 17: [2022-11-25 23:29:50,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 17: [2022-11-25 23:29:50,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 44: [2022-11-25 23:29:50,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-25 23:29:50,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-25 23:29:50,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 50: [2022-11-25 23:29:50,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-25 23:29:50,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-25 23:29:50,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 61: [2022-11-25 23:29:50,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-25 23:29:50,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-25 23:29:50,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 45: [2022-11-25 23:29:50,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-25 23:29:50,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-25 23:29:50,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 33: [2022-11-25 23:29:50,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-25 23:29:50,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-25 23:29:50,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 45: [2022-11-25 23:29:50,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 18: [2022-11-25 23:29:50,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-25 23:29:50,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 17: [2022-11-25 23:29:50,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 45: [2022-11-25 23:29:50,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 18: [2022-11-25 23:29:50,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 17: [2022-11-25 23:29:50,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 46: [2022-11-25 23:29:50,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 45: [2022-11-25 23:29:50,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 18: [2022-11-25 23:29:50,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-25 23:29:50,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 17: [2022-11-25 23:29:50,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 46: [2022-11-25 23:29:50,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 18: [2022-11-25 23:29:50,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 18: [2022-11-25 23:29:50,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 18: [2022-11-25 23:29:50,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 46: [2022-11-25 23:29:50,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 18: [2022-11-25 23:29:50,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 46: [2022-11-25 23:29:50,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-25 23:29:50,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 46: [2022-11-25 23:29:50,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 34: [2022-11-25 23:29:50,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 34: [2022-11-25 23:29:50,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-25 23:29:50,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 32: [2022-11-25 23:29:50,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-25 23:29:50,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-25 23:29:50,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 43: [2022-11-25 23:29:50,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-25 23:29:50,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-25 23:29:50,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 29: [2022-11-25 23:29:50,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-25 23:29:50,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 29: [2022-11-25 23:29:50,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-25 23:29:50,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 29: [2022-11-25 23:29:50,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-25 23:29:50,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-25 23:29:50,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 29: [2022-11-25 23:29:50,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 29: [2022-11-25 23:29:50,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 39: [2022-11-25 23:29:50,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-25 23:29:50,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-25 23:29:50,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-25 23:29:50,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-25 23:29:50,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-25 23:29:50,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-25 23:29:50,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 39: [2022-11-25 23:29:50,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 39: [2022-11-25 23:29:50,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 25: [2022-11-25 23:29:50,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-25 23:29:50,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-25 23:29:50,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 12: [2022-11-25 23:29:50,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 12: [2022-11-25 23:29:50,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-25 23:29:50,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-25 23:29:50,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-25 23:29:50,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 12: [2022-11-25 23:29:50,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-25 23:29:50,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-25 23:29:50,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 12: [2022-11-25 23:29:50,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 37: [2022-11-25 23:29:50,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-25 23:29:50,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-25 23:29:50,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 52: [2022-11-25 23:29:50,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-25 23:29:50,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-25 23:29:50,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 33: [2022-11-25 23:29:50,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-25 23:29:50,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-25 23:29:50,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 47: [2022-11-25 23:29:50,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-25 23:29:50,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-25 23:29:50,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 33: [2022-11-25 23:29:50,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-25 23:29:50,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-25 23:29:50,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 8: [2022-11-25 23:29:50,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-25 23:29:50,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-25 23:29:50,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 33: [2022-11-25 23:29:50,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-25 23:29:50,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-25 23:29:50,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 23: [2022-11-25 23:29:50,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 23: [2022-11-25 23:29:50,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 23: [2022-11-25 23:29:50,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 38: [2022-11-25 23:29:50,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-25 23:29:50,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-25 23:29:50,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-25 23:29:50,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-25 23:29:50,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 38: [2022-11-25 23:29:50,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 38: [2022-11-25 23:29:50,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-25 23:29:50,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-25 23:29:50,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 8: [2022-11-25 23:29:50,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-25 23:29:50,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-25 23:29:50,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 42: [2022-11-25 23:29:50,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 42: [2022-11-25 23:29:50,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 42: [2022-11-25 23:29:50,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 42: [2022-11-25 23:29:50,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 42: [2022-11-25 23:29:50,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-25 23:29:50,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-25 23:29:50,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 42: [2022-11-25 23:29:50,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 42: [2022-11-25 23:29:50,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 21: [2022-11-25 23:29:50,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-25 23:29:50,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-25 23:29:50,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 53: [2022-11-25 23:29:50,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-25 23:29:50,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-25 23:29:50,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-25 23:29:50,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 53: [2022-11-25 23:29:50,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-25 23:29:50,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 42: [2022-11-25 23:29:50,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 42: [2022-11-25 23:29:50,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-25 23:29:50,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 53: [2022-11-25 23:29:50,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-25 23:29:50,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-25 23:29:50,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-25 23:29:50,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-25 23:29:50,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 53: [2022-11-25 23:29:50,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 8: [2022-11-25 23:29:50,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-25 23:29:50,085] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-25 23:29:50,085] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 59: [2022-11-25 23:29:50,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-25 23:29:50,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-25 23:29:50,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 8: [2022-11-25 23:29:50,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-25 23:29:50,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 31: [2022-11-25 23:29:50,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 8: [2022-11-25 23:29:50,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 31: [2022-11-25 23:29:50,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-25 23:29:50,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 24: [2022-11-25 23:29:50,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-25 23:29:50,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-25 23:29:50,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-25 23:29:50,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-25 23:29:50,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 24: [2022-11-25 23:29:50,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 57: [2022-11-25 23:29:50,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 16: [2022-11-25 23:29:50,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 57: [2022-11-25 23:29:50,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 16: [2022-11-25 23:29:50,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 57: [2022-11-25 23:29:50,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 16: [2022-11-25 23:29:50,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 24: [2022-11-25 23:29:50,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-25 23:29:50,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-25 23:29:50,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 4: [2022-11-25 23:29:50,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-25 23:29:50,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-25 23:29:50,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 1: [2022-11-25 23:29:50,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 1: [2022-11-25 23:29:50,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 58: [2022-11-25 23:29:50,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 1: [2022-11-25 23:29:50,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 58: [2022-11-25 23:29:50,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-25 23:29:50,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 34: [2022-11-25 23:29:50,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 34: [2022-11-25 23:29:50,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-25 23:29:50,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 59: [2022-11-25 23:29:50,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 59: [2022-11-25 23:29:50,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-25 23:29:50,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 49: [2022-11-25 23:29:50,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-25 23:29:50,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 31: [2022-11-25 23:29:50,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 49: [2022-11-25 23:29:50,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 31: [2022-11-25 23:29:50,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-25 23:29:50,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 31: [2022-11-25 23:29:50,098] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-25 23:29:50,098] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 16: [2022-11-25 23:29:50,098] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 31: [2022-11-25 23:29:50,098] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 16: [2022-11-25 23:29:50,098] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-25 23:29:50,098] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 49: [2022-11-25 23:29:50,098] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-25 23:29:50,099] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-25 23:29:50,099] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 30: [2022-11-25 23:29:50,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-25 23:29:50,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 30: [2022-11-25 23:29:50,099] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-25 23:29:50,099] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-25 23:29:50,099] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 30: [2022-11-25 23:29:50,099] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 59: [2022-11-25 23:29:50,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-25 23:29:50,100] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-25 23:29:50,100] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 56: [2022-11-25 23:29:50,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 51: [2022-11-25 23:29:50,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 56: [2022-11-25 23:29:50,100] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 51: [2022-11-25 23:29:50,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-25 23:29:50,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 54: [2022-11-25 23:29:50,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 56: [2022-11-25 23:29:50,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 1: [2022-11-25 23:29:50,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-25 23:29:50,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 1: [2022-11-25 23:29:50,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 57: [2022-11-25 23:29:50,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 57: [2022-11-25 23:29:50,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 57: [2022-11-25 23:29:50,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 1: [2022-11-25 23:29:50,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-25 23:29:50,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-25 23:29:50,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 4: [2022-11-25 23:29:50,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 16: [2022-11-25 23:29:50,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-25 23:29:50,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 16: [2022-11-25 23:29:50,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 48: [2022-11-25 23:29:50,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-25 23:29:50,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-25 23:29:50,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 48: [2022-11-25 23:29:50,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-25 23:29:50,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-25 23:29:50,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 57: [2022-11-25 23:29:50,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 57: [2022-11-25 23:29:50,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 57: [2022-11-25 23:29:50,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 30: [2022-11-25 23:29:50,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-25 23:29:50,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-25 23:29:50,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 19: [2022-11-25 23:29:50,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-25 23:29:50,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-25 23:29:50,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 54: [2022-11-25 23:29:50,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-25 23:29:50,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 48: [2022-11-25 23:29:50,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-25 23:29:50,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-25 23:29:50,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 41: [2022-11-25 23:29:50,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-25 23:29:50,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 41: [2022-11-25 23:29:50,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 41: [2022-11-25 23:29:50,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 41: [2022-11-25 23:29:50,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 41: [2022-11-25 23:29:50,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 58: [2022-11-25 23:29:50,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-25 23:29:50,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-25 23:29:50,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 35: [2022-11-25 23:29:50,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 35: [2022-11-25 23:29:50,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 35: [2022-11-25 23:29:50,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-25 23:29:50,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-25 23:29:50,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-25 23:29:50,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-25 23:29:50,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 35: [2022-11-25 23:29:50,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 35: [2022-11-25 23:29:50,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 49: [2022-11-25 23:29:50,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 49: [2022-11-25 23:29:50,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-25 23:29:50,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 49: [2022-11-25 23:29:50,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-25 23:29:50,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-25 23:29:50,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 0: [2022-11-25 23:29:50,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 0: [2022-11-25 23:29:50,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 0: [2022-11-25 23:29:50,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 16: [2022-11-25 23:29:50,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-25 23:29:50,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-25 23:29:50,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 56: [2022-11-25 23:29:50,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 14: [2022-11-25 23:29:50,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-25 23:29:50,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-25 23:29:50,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 56: [2022-11-25 23:29:50,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-25 23:29:50,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 41: [2022-11-25 23:29:50,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-25 23:29:50,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-25 23:29:50,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-25 23:29:50,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-25 23:29:50,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 41: [2022-11-25 23:29:50,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 28: [2022-11-25 23:29:50,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-25 23:29:50,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-25 23:29:50,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 28: [2022-11-25 23:29:50,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-25 23:29:50,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-25 23:29:50,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-25 23:29:50,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 28: [2022-11-25 23:29:50,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 28: [2022-11-25 23:29:50,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 24: [2022-11-25 23:29:50,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-25 23:29:50,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-25 23:29:50,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 2: [2022-11-25 23:29:50,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-25 23:29:50,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-25 23:29:50,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-25 23:29:50,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-25 23:29:50,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-25 23:29:50,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 2: [2022-11-25 23:29:50,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-25 23:29:50,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-25 23:29:50,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-25 23:29:50,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 2: [2022-11-25 23:29:50,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 2: [2022-11-25 23:29:50,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 10: [2022-11-25 23:29:50,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 10: [2022-11-25 23:29:50,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 10: [2022-11-25 23:29:50,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 56: [2022-11-25 23:29:50,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-25 23:29:50,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 56: [2022-11-25 23:29:50,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 56: [2022-11-25 23:29:50,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-25 23:29:50,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 56: [2022-11-25 23:29:50,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 4: [2022-11-25 23:29:50,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-25 23:29:50,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 4: [2022-11-25 23:29:50,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 4: [2022-11-25 23:29:50,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-25 23:29:50,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 15: [2022-11-25 23:29:50,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-25 23:29:50,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-25 23:29:50,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 15: [2022-11-25 23:29:50,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-25 23:29:50,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-25 23:29:50,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 15: [2022-11-25 23:29:50,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 15: [2022-11-25 23:29:50,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-25 23:29:50,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-25 23:29:50,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 15: [2022-11-25 23:29:50,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 15: [2022-11-25 23:29:50,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 58: [2022-11-25 23:29:50,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 58: [2022-11-25 23:29:50,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-25 23:29:50,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-25 23:29:50,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 58: [2022-11-25 23:29:50,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-25 23:29:50,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 4: [2022-11-25 23:29:50,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 11: [2022-11-25 23:29:50,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-25 23:29:50,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-25 23:29:50,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 26: [2022-11-25 23:29:50,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 26: [2022-11-25 23:29:50,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 26: [2022-11-25 23:29:50,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-25 23:29:50,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-25 23:29:50,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 26: [2022-11-25 23:29:50,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 26: [2022-11-25 23:29:50,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-25 23:29:50,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-25 23:29:50,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 17: [2022-11-25 23:29:50,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-25 23:29:50,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-25 23:29:50,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 14: [2022-11-25 23:29:50,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-25 23:29:50,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-25 23:29:50,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 4: [2022-11-25 23:29:50,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-25 23:29:50,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 28: [2022-11-25 23:29:50,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 28: [2022-11-25 23:29:50,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-25 23:29:50,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 45: [2022-11-25 23:29:50,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-25 23:29:50,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-25 23:29:50,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 9: [2022-11-25 23:29:50,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-25 23:29:50,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-25 23:29:50,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 61: [2022-11-25 23:29:50,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-25 23:29:50,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-25 23:29:50,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 16: [2022-11-25 23:29:50,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-25 23:29:50,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-25 23:29:50,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 13: [2022-11-25 23:29:50,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-25 23:29:50,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-25 23:29:50,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 6: [2022-11-25 23:29:50,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-25 23:29:50,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-25 23:29:50,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 57: [2022-11-25 23:29:50,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 57: [2022-11-25 23:29:50,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 57: [2022-11-25 23:29:50,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 40: [2022-11-25 23:29:50,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-25 23:29:50,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 40: [2022-11-25 23:29:50,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-25 23:29:50,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-25 23:29:50,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-25 23:29:50,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 40: [2022-11-25 23:29:50,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-25 23:29:50,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 40: [2022-11-25 23:29:50,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 40: [2022-11-25 23:29:50,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 40: [2022-11-25 23:29:50,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-25 23:29:50,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 26: [2022-11-25 23:29:50,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-25 23:29:50,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-25 23:29:50,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 22: [2022-11-25 23:29:50,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-25 23:29:50,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-25 23:29:50,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 19: [2022-11-25 23:29:50,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-25 23:29:50,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-25 23:29:50,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 55: [2022-11-25 23:29:50,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-25 23:29:50,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-25 23:29:50,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-25 23:29:50,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 55: [2022-11-25 23:29:50,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-25 23:29:50,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-25 23:29:50,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-25 23:29:50,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 55: [2022-11-25 23:29:50,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 55: [2022-11-25 23:29:50,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 55: [2022-11-25 23:29:50,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 55: [2022-11-25 23:29:50,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 1: [2022-11-25 23:29:50,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-25 23:29:50,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-25 23:29:50,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 19: [2022-11-25 23:29:50,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 59: [2022-11-25 23:29:50,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 19: [2022-11-25 23:29:50,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 59: [2022-11-25 23:29:50,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 19: [2022-11-25 23:29:50,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 59: [2022-11-25 23:29:50,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 19: [2022-11-25 23:29:50,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-25 23:29:50,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-25 23:29:50,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 18: [2022-11-25 23:29:50,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-25 23:29:50,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-25 23:29:50,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 54: [2022-11-25 23:29:50,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 54: [2022-11-25 23:29:50,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 61: [2022-11-25 23:29:50,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-25 23:29:50,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-25 23:29:50,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 61: [2022-11-25 23:29:50,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 61: [2022-11-25 23:29:50,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 61: [2022-11-25 23:29:50,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 61: [2022-11-25 23:29:50,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-25 23:29:50,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-25 23:29:50,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 54: [2022-11-25 23:29:50,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 54: [2022-11-25 23:29:50,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-25 23:29:50,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-25 23:29:50,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-25 23:29:50,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 54: [2022-11-25 23:29:50,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 54: [2022-11-25 23:29:50,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 5: [2022-11-25 23:29:50,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-25 23:29:50,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-25 23:29:50,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 51: [2022-11-25 23:29:50,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-25 23:29:50,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-25 23:29:50,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 7: [2022-11-25 23:29:50,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-25 23:29:50,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-25 23:29:50,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 14: [2022-11-25 23:29:50,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 14: [2022-11-25 23:29:50,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-25 23:29:50,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 31: [2022-11-25 23:29:50,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-25 23:29:50,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-25 23:29:50,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 53: [2022-11-25 23:29:50,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-25 23:29:50,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-25 23:29:50,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 51: [2022-11-25 23:29:50,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-25 23:29:50,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-25 23:29:50,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 57: [2022-11-25 23:29:50,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 57: [2022-11-25 23:29:50,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-25 23:29:50,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 12: [2022-11-25 23:29:50,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-25 23:29:50,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-25 23:29:50,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 51: [2022-11-25 23:29:50,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 51: [2022-11-25 23:29:50,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-25 23:29:50,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 39: [2022-11-25 23:29:50,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 48: [2022-11-25 23:29:50,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 39: [2022-11-25 23:29:50,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 48: [2022-11-25 23:29:50,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 39: [2022-11-25 23:29:50,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 48: [2022-11-25 23:29:50,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 62: [2022-11-25 23:29:50,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-25 23:29:50,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 62: [2022-11-25 23:29:50,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 29: [2022-11-25 23:29:50,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-25 23:29:50,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-25 23:29:50,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 46: [2022-11-25 23:29:50,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-25 23:29:50,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 46: [2022-11-25 23:29:50,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 4: [2022-11-25 23:29:50,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 4: [2022-11-25 23:29:50,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-25 23:29:50,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 30: [2022-11-25 23:29:50,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-25 23:29:50,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 32: [2022-11-25 23:29:50,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 30: [2022-11-25 23:29:50,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 32: [2022-11-25 23:29:50,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-25 23:29:50,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 50: [2022-11-25 23:29:50,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 50: [2022-11-25 23:29:50,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-25 23:29:50,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 63: [2022-11-25 23:29:50,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 63: [2022-11-25 23:29:50,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-25 23:29:50,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 21: [2022-11-25 23:29:50,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-25 23:29:50,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-25 23:29:50,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 27: [2022-11-25 23:29:50,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-25 23:29:50,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-25 23:29:50,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 2: [2022-11-25 23:29:50,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 2: [2022-11-25 23:29:50,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-25 23:29:50,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 23: [2022-11-25 23:29:50,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-25 23:29:50,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-25 23:29:50,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 43: [2022-11-25 23:29:50,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-25 23:29:50,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 43: [2022-11-25 23:29:50,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 8: [2022-11-25 23:29:50,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 8: [2022-11-25 23:29:50,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-25 23:29:50,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 60: [2022-11-25 23:29:50,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-25 23:29:50,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-25 23:29:50,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 58: [2022-11-25 23:29:50,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-25 23:29:50,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-25 23:29:50,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 41: [2022-11-25 23:29:50,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-25 23:29:50,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-25 23:29:50,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 24: [2022-11-25 23:29:50,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 24: [2022-11-25 23:29:50,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-25 23:29:50,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 3: [2022-11-25 23:29:50,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-25 23:29:50,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 15: [2022-11-25 23:29:50,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 3: [2022-11-25 23:29:50,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 15: [2022-11-25 23:29:50,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-25 23:29:50,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 19: [2022-11-25 23:29:50,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 19: [2022-11-25 23:29:50,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-25 23:29:50,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 54: [2022-11-25 23:29:50,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 54: [2022-11-25 23:29:50,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-25 23:29:50,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 40: [2022-11-25 23:29:50,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-25 23:29:50,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-25 23:29:50,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 33: [2022-11-25 23:29:50,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 33: [2022-11-25 23:29:50,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-25 23:29:50,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 36: [2022-11-25 23:29:50,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 28: [2022-11-25 23:29:50,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-25 23:29:50,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-25 23:29:50,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 36: [2022-11-25 23:29:50,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-25 23:29:50,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 25: [2022-11-25 23:29:50,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 25: [2022-11-25 23:29:50,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-25 23:29:50,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 14: [2022-11-25 23:29:50,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-25 23:29:50,240] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-25 23:29:50,240] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 34: [2022-11-25 23:29:50,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-25 23:29:50,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 44: [2022-11-25 23:29:50,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 20: [2022-11-25 23:29:50,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 34: [2022-11-25 23:29:50,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 44: [2022-11-25 23:29:50,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 20: [2022-11-25 23:29:50,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 44: [2022-11-25 23:29:50,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 20: [2022-11-25 23:29:50,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 20: [2022-11-25 23:29:50,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-25 23:29:50,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-25 23:29:50,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-25 23:29:50,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 20: [2022-11-25 23:29:50,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-25 23:29:50,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 20: [2022-11-25 23:29:50,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-25 23:29:50,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 20: [2022-11-25 23:29:50,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 20: [2022-11-25 23:29:50,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 20: [2022-11-25 23:29:50,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-25 23:29:50,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 37: [2022-11-25 23:29:50,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-25 23:29:50,243] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-25 23:29:50,243] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 55: [2022-11-25 23:29:50,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-25 23:29:50,243] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-25 23:29:50,243] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 10: [2022-11-25 23:29:50,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 10: [2022-11-25 23:29:50,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-25 23:29:50,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 56: [2022-11-25 23:29:50,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-25 23:29:50,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-25 23:29:50,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 42: [2022-11-25 23:29:50,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 42: [2022-11-25 23:29:50,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-25 23:29:50,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 35: [2022-11-25 23:29:50,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-25 23:29:50,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-25 23:29:50,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 17: [2022-11-25 23:29:50,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-25 23:29:50,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-25 23:29:50,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 47: [2022-11-25 23:29:50,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 47: [2022-11-25 23:29:50,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 47: [2022-11-25 23:29:50,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 59: [2022-11-25 23:29:50,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-25 23:29:50,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-25 23:29:50,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 0: [2022-11-25 23:29:50,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 49: [2022-11-25 23:29:50,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-25 23:29:50,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 0: [2022-11-25 23:29:50,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 49: [2022-11-25 23:29:50,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 61: [2022-11-25 23:29:50,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 0: [2022-11-25 23:29:50,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 61: [2022-11-25 23:29:50,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-25 23:29:50,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 11: [2022-11-25 23:29:50,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 11: [2022-11-25 23:29:50,250] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 11: [2022-11-25 23:29:50,250] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 51: [2022-11-25 23:29:50,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-25 23:29:50,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-25 23:29:50,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 38: [2022-11-25 23:29:50,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 38: [2022-11-25 23:29:50,257] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-25 23:29:50,257] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 52: [2022-11-25 23:29:50,260] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-25 23:29:50,260] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-25 23:29:50,260] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 26: [2022-11-25 23:29:50,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-25 23:29:50,261] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-25 23:29:50,261] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 16: [2022-11-25 23:29:50,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-25 23:29:50,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-25 23:29:50,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 12: [2022-11-25 23:29:50,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-25 23:29:50,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-25 23:29:50,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 45: [2022-11-25 23:29:50,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 45: [2022-11-25 23:29:50,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 45: [2022-11-25 23:29:50,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 9: [2022-11-25 23:29:50,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 9: [2022-11-25 23:29:50,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 7: [2022-11-25 23:29:50,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 9: [2022-11-25 23:29:50,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 7: [2022-11-25 23:29:50,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 13: [2022-11-25 23:29:50,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 7: [2022-11-25 23:29:50,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 13: [2022-11-25 23:29:50,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-25 23:29:50,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 5: [2022-11-25 23:29:50,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-25 23:29:50,269] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-25 23:29:50,269] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 18: [2022-11-25 23:29:50,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 18: [2022-11-25 23:29:50,270] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-25 23:29:50,270] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 22: [2022-11-25 23:29:50,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-25 23:29:50,270] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-25 23:29:50,270] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 6: [2022-11-25 23:29:50,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 57: [2022-11-25 23:29:50,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 6: [2022-11-25 23:29:50,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 57: [2022-11-25 23:29:50,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 6: [2022-11-25 23:29:50,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 57: [2022-11-25 23:29:50,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 53: [2022-11-25 23:29:50,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-25 23:29:50,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-25 23:29:50,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 1: [2022-11-25 23:29:50,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 39: [2022-11-25 23:29:50,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 1: [2022-11-25 23:29:50,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 39: [2022-11-25 23:29:50,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 1: [2022-11-25 23:29:50,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 39: [2022-11-25 23:29:50,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 32: [2022-11-25 23:29:50,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 32: [2022-11-25 23:29:50,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-25 23:29:50,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 21: [2022-11-25 23:29:50,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-25 23:29:50,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 21: [2022-11-25 23:29:50,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 31: [2022-11-25 23:29:50,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-25 23:29:50,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-25 23:29:50,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 19: [2022-11-25 23:29:50,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 19: [2022-11-25 23:29:50,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-25 23:29:50,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 62: [2022-11-25 23:29:50,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-25 23:29:50,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-25 23:29:50,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 63: [2022-11-25 23:29:50,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-25 23:29:50,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-25 23:29:50,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 50: [2022-11-25 23:29:50,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-25 23:29:50,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-25 23:29:50,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 27: [2022-11-25 23:29:50,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 29: [2022-11-25 23:29:50,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-25 23:29:50,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 27: [2022-11-25 23:29:50,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 29: [2022-11-25 23:29:50,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 27: [2022-11-25 23:29:50,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 46: [2022-11-25 23:29:50,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-25 23:29:50,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-25 23:29:50,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 54: [2022-11-25 23:29:50,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 48: [2022-11-25 23:29:50,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-25 23:29:50,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-25 23:29:50,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 20: [2022-11-25 23:29:50,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-25 23:29:50,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-25 23:29:50,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 54: [2022-11-25 23:29:50,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-25 23:29:50,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 60: [2022-11-25 23:29:50,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-25 23:29:50,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-25 23:29:50,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 23: [2022-11-25 23:29:50,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-25 23:29:50,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-25 23:29:50,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 24: [2022-11-25 23:29:50,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-25 23:29:50,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-25 23:29:50,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 30: [2022-11-25 23:29:50,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 8: [2022-11-25 23:29:50,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 30: [2022-11-25 23:29:50,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-25 23:29:50,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 8: [2022-11-25 23:29:50,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 8: [2022-11-25 23:29:50,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 40: [2022-11-25 23:29:50,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-25 23:29:50,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-25 23:29:50,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 3: [2022-11-25 23:29:50,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 43: [2022-11-25 23:29:50,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-25 23:29:50,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 3: [2022-11-25 23:29:50,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 43: [2022-11-25 23:29:50,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 2: [2022-11-25 23:29:50,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 3: [2022-11-25 23:29:50,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 2: [2022-11-25 23:29:50,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-25 23:29:50,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 58: [2022-11-25 23:29:50,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-25 23:29:50,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-25 23:29:50,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 14: [2022-11-25 23:29:50,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-25 23:29:50,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-25 23:29:50,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 44: [2022-11-25 23:29:50,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-25 23:29:50,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-25 23:29:50,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 4: [2022-11-25 23:29:50,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-25 23:29:50,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-25 23:29:50,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 15: [2022-11-25 23:29:50,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-25 23:29:50,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-25 23:29:50,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 25: [2022-11-25 23:29:50,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 36: [2022-11-25 23:29:50,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 25: [2022-11-25 23:29:50,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 36: [2022-11-25 23:29:50,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-25 23:29:50,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 25: [2022-11-25 23:29:50,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 33: [2022-11-25 23:29:50,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-25 23:29:50,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-25 23:29:50,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 28: [2022-11-25 23:29:50,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-25 23:29:50,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 61: [2022-11-25 23:29:50,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 28: [2022-11-25 23:29:50,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 61: [2022-11-25 23:29:50,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-25 23:29:50,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 42: [2022-11-25 23:29:50,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 42: [2022-11-25 23:29:50,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-25 23:29:50,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 34: [2022-11-25 23:29:50,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 34: [2022-11-25 23:29:50,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-25 23:29:50,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 17: [2022-11-25 23:29:50,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-25 23:29:50,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-25 23:29:50,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 49: [2022-11-25 23:29:50,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-25 23:29:50,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-25 23:29:50,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 35: [2022-11-25 23:29:50,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-25 23:29:50,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-25 23:29:50,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 47: [2022-11-25 23:29:50,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-25 23:29:50,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-25 23:29:50,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 10: [2022-11-25 23:29:50,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-25 23:29:50,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-25 23:29:50,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 11: [2022-11-25 23:29:50,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 11: [2022-11-25 23:29:50,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 11: [2022-11-25 23:29:50,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 6: [2022-11-25 23:29:50,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-25 23:29:50,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-25 23:29:50,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 0: [2022-11-25 23:29:50,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-25 23:29:50,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-25 23:29:50,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 1: [2022-11-25 23:29:50,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 1: [2022-11-25 23:29:50,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 56: [2022-11-25 23:29:50,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 1: [2022-11-25 23:29:50,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 56: [2022-11-25 23:29:50,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 56: [2022-11-25 23:29:50,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 59: [2022-11-25 23:29:50,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-25 23:29:50,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-25 23:29:50,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 51: [2022-11-25 23:29:50,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 51: [2022-11-25 23:29:50,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-25 23:29:50,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 16: [2022-11-25 23:29:50,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-25 23:29:50,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-25 23:29:50,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 9: [2022-11-25 23:29:50,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 9: [2022-11-25 23:29:50,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-25 23:29:50,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 52: [2022-11-25 23:29:50,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-25 23:29:50,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-25 23:29:50,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 12: [2022-11-25 23:29:50,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-25 23:29:50,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-25 23:29:50,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 13: [2022-11-25 23:29:50,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 7: [2022-11-25 23:29:50,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 41: [2022-11-25 23:29:50,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 13: [2022-11-25 23:29:50,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 7: [2022-11-25 23:29:50,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 41: [2022-11-25 23:29:50,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 13: [2022-11-25 23:29:50,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 7: [2022-11-25 23:29:50,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 41: [2022-11-25 23:29:50,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 37: [2022-11-25 23:29:50,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-25 23:29:50,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-25 23:29:50,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 54: [2022-11-25 23:29:50,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 55: [2022-11-25 23:29:50,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 54: [2022-11-25 23:29:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-25 23:29:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 55: [2022-11-25 23:29:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 31: [2022-11-25 23:29:50,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 55: [2022-11-25 23:29:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 26: [2022-11-25 23:29:50,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 26: [2022-11-25 23:29:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 31: [2022-11-25 23:29:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-25 23:29:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 26: [2022-11-25 23:29:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 22: [2022-11-25 23:29:50,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-25 23:29:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-25 23:29:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 5: [2022-11-25 23:29:50,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-25 23:29:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-25 23:29:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 57: [2022-11-25 23:29:50,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 57: [2022-11-25 23:29:50,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-25 23:29:50,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 48: [2022-11-25 23:29:50,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-25 23:29:50,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-25 23:29:50,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 45: [2022-11-25 23:29:50,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-25 23:29:50,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-25 23:29:50,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 53: [2022-11-25 23:29:50,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-25 23:29:50,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-25 23:29:50,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 62: [2022-11-25 23:29:50,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-25 23:29:50,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-25 23:29:50,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 30: [2022-11-25 23:29:50,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 30: [2022-11-25 23:29:50,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 32: [2022-11-25 23:29:50,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 30: [2022-11-25 23:29:50,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 32: [2022-11-25 23:29:50,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-25 23:29:50,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 23: [2022-11-25 23:29:50,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-25 23:29:50,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 29: [2022-11-25 23:29:50,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 23: [2022-11-25 23:29:50,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 29: [2022-11-25 23:29:50,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-25 23:29:50,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 18: [2022-11-25 23:29:50,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-25 23:29:50,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-25 23:29:50,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 39: [2022-11-25 23:29:50,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-25 23:29:50,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-25 23:29:50,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 19: [2022-11-25 23:29:50,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-25 23:29:50,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-25 23:29:50,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 24: [2022-11-25 23:29:50,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-25 23:29:50,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-25 23:29:50,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 63: [2022-11-25 23:29:50,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-25 23:29:50,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-25 23:29:50,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 27: [2022-11-25 23:29:50,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-25 23:29:50,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-25 23:29:50,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 15: [2022-11-25 23:29:50,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-25 23:29:50,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-25 23:29:50,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 60: [2022-11-25 23:29:50,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-25 23:29:50,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-25 23:29:50,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 3: [2022-11-25 23:29:50,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-25 23:29:50,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-25 23:29:50,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 58: [2022-11-25 23:29:50,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 58: [2022-11-25 23:29:50,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-25 23:29:50,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 8: [2022-11-25 23:29:50,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 8: [2022-11-25 23:29:50,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-25 23:29:50,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 38: [2022-11-25 23:29:50,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 38: [2022-11-25 23:29:50,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-25 23:29:50,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 40: [2022-11-25 23:29:50,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-25 23:29:50,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-25 23:29:50,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 21: [2022-11-25 23:29:50,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 21: [2022-11-25 23:29:50,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-25 23:29:50,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 46: [2022-11-25 23:29:50,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-25 23:29:50,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-25 23:29:50,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 4: [2022-11-25 23:29:50,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 4: [2022-11-25 23:29:50,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-25 23:29:50,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 2: [2022-11-25 23:29:50,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-25 23:29:50,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-25 23:29:50,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 43: [2022-11-25 23:29:50,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-25 23:29:50,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-25 23:29:50,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 14: [2022-11-25 23:29:50,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 14: [2022-11-25 23:29:50,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-25 23:29:50,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 33: [2022-11-25 23:29:50,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 33: [2022-11-25 23:29:50,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 33: [2022-11-25 23:29:50,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 44: [2022-11-25 23:29:50,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-25 23:29:50,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-25 23:29:50,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 36: [2022-11-25 23:29:50,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-25 23:29:50,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-25 23:29:50,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 52: [2022-11-25 23:29:50,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-25 23:29:50,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-25 23:29:50,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 37: [2022-11-25 23:29:50,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 37: [2022-11-25 23:29:50,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-25 23:29:50,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 41: [2022-11-25 23:29:50,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-25 23:29:50,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-25 23:29:50,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 47: [2022-11-25 23:29:50,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-25 23:29:50,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-25 23:29:50,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 25: [2022-11-25 23:29:50,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 25: [2022-11-25 23:29:50,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 25: [2022-11-25 23:29:50,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 0: [2022-11-25 23:29:50,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-25 23:29:50,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 0: [2022-11-25 23:29:50,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 61: [2022-11-25 23:29:50,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-25 23:29:50,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-25 23:29:50,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 56: [2022-11-25 23:29:50,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 56: [2022-11-25 23:29:50,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-25 23:29:50,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 11: [2022-11-25 23:29:50,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 11: [2022-11-25 23:29:50,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-25 23:29:50,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 17: [2022-11-25 23:29:50,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-25 23:29:50,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-25 23:29:50,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 51: [2022-11-25 23:29:50,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 34: [2022-11-25 23:29:50,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 51: [2022-11-25 23:29:50,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-25 23:29:50,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 34: [2022-11-25 23:29:50,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-25 23:29:50,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 59: [2022-11-25 23:29:50,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 59: [2022-11-25 23:29:50,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-25 23:29:50,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 20: [2022-11-25 23:29:50,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-25 23:29:50,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 55: [2022-11-25 23:29:50,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 20: [2022-11-25 23:29:50,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 55: [2022-11-25 23:29:50,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-25 23:29:50,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 10: [2022-11-25 23:29:50,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-25 23:29:50,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-25 23:29:50,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 16: [2022-11-25 23:29:50,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-25 23:29:50,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-25 23:29:50,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 35: [2022-11-25 23:29:50,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-25 23:29:50,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-25 23:29:50,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 42: [2022-11-25 23:29:50,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 42: [2022-11-25 23:29:50,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 28: [2022-11-25 23:29:50,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 42: [2022-11-25 23:29:50,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 28: [2022-11-25 23:29:50,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-25 23:29:50,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 45: [2022-11-25 23:29:50,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 45: [2022-11-25 23:29:50,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-25 23:29:50,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 38: [2022-11-25 23:29:50,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-25 23:29:50,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-25 23:29:50,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 57: [2022-11-25 23:29:50,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 57: [2022-11-25 23:29:50,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 57: [2022-11-25 23:29:50,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 13: [2022-11-25 23:29:50,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-25 23:29:50,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-25 23:29:50,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 22: [2022-11-25 23:29:50,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-25 23:29:50,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-25 23:29:50,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 6: [2022-11-25 23:29:50,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 6: [2022-11-25 23:29:50,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-25 23:29:50,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 1: [2022-11-25 23:29:50,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-25 23:29:50,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-25 23:29:50,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 12: [2022-11-25 23:29:50,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 12: [2022-11-25 23:29:50,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-25 23:29:50,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 54: [2022-11-25 23:29:50,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 54: [2022-11-25 23:29:50,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-25 23:29:50,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 62: [2022-11-25 23:29:50,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-25 23:29:50,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-25 23:29:50,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 5: [2022-11-25 23:29:50,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 5: [2022-11-25 23:29:50,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-25 23:29:50,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 63: [2022-11-25 23:29:50,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-25 23:29:50,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-25 23:29:50,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 18: [2022-11-25 23:29:50,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-25 23:29:50,366] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-25 23:29:50,366] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 40: [2022-11-25 23:29:50,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-25 23:29:50,366] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-25 23:29:50,366] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 31: [2022-11-25 23:29:50,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-25 23:29:50,366] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-25 23:29:50,366] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 58: [2022-11-25 23:29:50,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-25 23:29:50,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-25 23:29:50,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 22: [2022-11-25 23:29:50,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-25 23:29:50,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-25 23:29:50,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 7: [2022-11-25 23:29:50,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 2: [2022-11-25 23:29:50,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 7: [2022-11-25 23:29:50,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 2: [2022-11-25 23:29:50,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 7: [2022-11-25 23:29:50,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 2: [2022-11-25 23:29:50,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 31: [2022-11-25 23:29:50,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 12: [2022-11-25 23:29:50,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 30: [2022-11-25 23:29:50,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 12: [2022-11-25 23:29:50,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 31: [2022-11-25 23:29:50,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-25 23:29:50,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 12: [2022-11-25 23:29:50,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 30: [2022-11-25 23:29:50,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-25 23:29:50,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 48: [2022-11-25 23:29:50,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 48: [2022-11-25 23:29:50,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-25 23:29:50,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 32: [2022-11-25 23:29:50,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-25 23:29:50,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-25 23:29:50,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 26: [2022-11-25 23:29:50,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 37: [2022-11-25 23:29:50,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 26: [2022-11-25 23:29:50,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 37: [2022-11-25 23:29:50,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 26: [2022-11-25 23:29:50,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 37: [2022-11-25 23:29:50,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 47: [2022-11-25 23:29:50,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-25 23:29:50,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-25 23:29:50,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 55: [2022-11-25 23:29:50,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-25 23:29:50,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-25 23:29:50,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 43: [2022-11-25 23:29:50,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-25 23:29:50,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-25 23:29:50,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 46: [2022-11-25 23:29:50,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-25 23:29:50,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-25 23:29:50,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 50: [2022-11-25 23:29:50,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-25 23:29:50,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 4: [2022-11-25 23:29:50,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 50: [2022-11-25 23:29:50,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 4: [2022-11-25 23:29:50,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 53: [2022-11-25 23:29:50,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-25 23:29:50,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 4: [2022-11-25 23:29:50,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 53: [2022-11-25 23:29:50,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 36: [2022-11-25 23:29:50,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-25 23:29:50,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-25 23:29:50,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 21: [2022-11-25 23:29:50,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-25 23:29:50,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-25 23:29:50,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 34: [2022-11-25 23:29:50,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 34: [2022-11-25 23:29:50,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-25 23:29:50,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 52: [2022-11-25 23:29:50,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-25 23:29:50,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-25 23:29:50,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 19: [2022-11-25 23:29:50,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-25 23:29:50,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 9: [2022-11-25 23:29:50,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 19: [2022-11-25 23:29:50,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 9: [2022-11-25 23:29:50,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-25 23:29:50,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 13: [2022-11-25 23:29:50,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 7: [2022-11-25 23:29:50,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 7: [2022-11-25 23:29:50,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 8: [2022-11-25 23:29:50,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 13: [2022-11-25 23:29:50,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 7: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 8: [2022-11-25 23:29:50,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 13: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 14: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 8: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 29: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-25 23:29:50,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 14: [2022-11-25 23:29:50,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 29: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 14: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 42: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 48: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 42: [2022-11-25 23:29:50,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 33: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 48: [2022-11-25 23:29:50,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 42: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 33: [2022-11-25 23:29:50,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 48: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 33: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 44: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-25 23:29:50,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 60: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-25 23:29:50,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 29: [2022-11-25 23:29:50,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 60: [2022-11-25 23:29:50,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 29: [2022-11-25 23:29:50,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-25 23:29:50,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 23: [2022-11-25 23:29:50,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-25 23:29:50,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 23: [2022-11-25 23:29:50,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 30: [2022-11-25 23:29:50,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-25 23:29:50,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-25 23:29:50,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 39: [2022-11-25 23:29:50,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-25 23:29:50,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-25 23:29:50,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 56: [2022-11-25 23:29:50,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 15: [2022-11-25 23:29:50,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-25 23:29:50,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 27: [2022-11-25 23:29:50,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 15: [2022-11-25 23:29:50,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 56: [2022-11-25 23:29:50,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-25 23:29:50,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 27: [2022-11-25 23:29:50,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-25 23:29:50,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 1: [2022-11-25 23:29:50,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-25 23:29:50,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-25 23:29:50,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 24: [2022-11-25 23:29:50,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 24: [2022-11-25 23:29:50,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-25 23:29:50,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 5: [2022-11-25 23:29:50,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-25 23:29:50,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 5: [2022-11-25 23:29:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 3: [2022-11-25 23:29:50,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 3: [2022-11-25 23:29:50,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-25 23:29:50,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 20: [2022-11-25 23:29:50,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-25 23:29:50,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-25 23:29:50,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 25: [2022-11-25 23:29:50,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 25: [2022-11-25 23:29:50,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-25 23:29:50,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 51: [2022-11-25 23:29:50,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-25 23:29:50,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-25 23:29:50,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 38: [2022-11-25 23:29:50,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 38: [2022-11-25 23:29:50,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-25 23:29:50,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 38: [2022-11-25 23:29:50,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-25 23:29:50,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-25 23:29:50,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 49: [2022-11-25 23:29:50,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-25 23:29:50,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-25 23:29:50,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 35: [2022-11-25 23:29:50,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-25 23:29:50,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-25 23:29:50,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 59: [2022-11-25 23:29:50,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 35: [2022-11-25 23:29:50,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 35: [2022-11-25 23:29:50,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 59: [2022-11-25 23:29:50,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 35: [2022-11-25 23:29:50,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 59: [2022-11-25 23:29:50,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 26: [2022-11-25 23:29:50,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-25 23:29:50,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-25 23:29:50,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 28: [2022-11-25 23:29:50,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-25 23:29:50,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-25 23:29:50,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 39: [2022-11-25 23:29:50,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-25 23:29:50,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-25 23:29:50,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 46: [2022-11-25 23:29:50,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-25 23:29:50,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-25 23:29:50,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 18: [2022-11-25 23:29:50,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-25 23:29:50,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-25 23:29:50,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 50: [2022-11-25 23:29:50,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 50: [2022-11-25 23:29:50,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-25 23:29:50,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 49: [2022-11-25 23:29:50,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-25 23:29:50,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 49: [2022-11-25 23:29:50,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 41: [2022-11-25 23:29:50,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 41: [2022-11-25 23:29:50,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step9000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-25 23:29:50,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! 0: successfully saved checkpoint at iteration 9000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5206.97 63: iteration 9010/ 24424 | consumed samples: 4613120 | consumed tokens: 9447669760 | elapsed time per iteration (s): 2.83 | learning rate: 1.477E-04 | global batch size: 512 | lm loss: 2.187927E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.636 | TFLOPs: 18.60 | 63: iteration 9020/ 24424 | consumed samples: 4618240 | consumed tokens: 9458155520 | elapsed time per iteration (s): 2.24 | learning rate: 1.476E-04 | global batch size: 512 | lm loss: 2.163713E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.481 | TFLOPs: 23.52 | 63: iteration 9030/ 24424 | consumed samples: 4623360 | consumed tokens: 9468641280 | elapsed time per iteration (s): 2.23 | learning rate: 1.475E-04 | global batch size: 512 | lm loss: 2.160458E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.110 | TFLOPs: 23.59 | 63: iteration 9040/ 24424 | consumed samples: 4628480 | consumed tokens: 9479127040 | elapsed time per iteration (s): 4.02 | learning rate: 1.474E-04 | global batch size: 512 | lm loss: 2.169424E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 127.345 | TFLOPs: 13.11 | 63: iteration 9050/ 24424 | consumed samples: 4633600 | consumed tokens: 9489612800 | elapsed time per iteration (s): 2.27 | learning rate: 1.473E-04 | global batch size: 512 | lm loss: 2.174189E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.512 | TFLOPs: 23.22 | 63: iteration 9060/ 24424 | consumed samples: 4638720 | consumed tokens: 9500098560 | elapsed time per iteration (s): 2.27 | learning rate: 1.471E-04 | global batch size: 512 | lm loss: 2.196357E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.874 | TFLOPs: 23.25 | 63: iteration 9070/ 24424 | consumed samples: 4643840 | consumed tokens: 9510584320 | elapsed time per iteration (s): 2.26 | learning rate: 1.470E-04 | global batch size: 512 | lm loss: 2.168727E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.510 | TFLOPs: 23.32 | 63: iteration 9080/ 24424 | consumed samples: 4648960 | consumed tokens: 9521070080 | elapsed time per iteration (s): 2.23 | learning rate: 1.469E-04 | global batch size: 512 | lm loss: 2.135779E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.717 | TFLOPs: 23.65 | 63: iteration 9090/ 24424 | consumed samples: 4654080 | consumed tokens: 9531555840 | elapsed time per iteration (s): 2.25 | learning rate: 1.468E-04 | global batch size: 512 | lm loss: 2.153856E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.237 | TFLOPs: 23.39 | 63: iteration 9100/ 24424 | consumed samples: 4659200 | consumed tokens: 9542041600 | elapsed time per iteration (s): 2.31 | learning rate: 1.467E-04 | global batch size: 512 | lm loss: 2.166791E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.478 | TFLOPs: 22.80 | 63: iteration 9110/ 24424 | consumed samples: 4664320 | consumed tokens: 9552527360 | elapsed time per iteration (s): 2.25 | learning rate: 1.466E-04 | global batch size: 512 | lm loss: 2.161992E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.939 | TFLOPs: 23.47 | 63: iteration 9120/ 24424 | consumed samples: 4669440 | consumed tokens: 9563013120 | elapsed time per iteration (s): 2.26 | learning rate: 1.465E-04 | global batch size: 512 | lm loss: 2.146847E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.937 | TFLOPs: 23.36 | 63: iteration 9130/ 24424 | consumed samples: 4674560 | consumed tokens: 9573498880 | elapsed time per iteration (s): 2.23 | learning rate: 1.464E-04 | global batch size: 512 | lm loss: 2.148673E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.823 | TFLOPs: 23.66 | 63: iteration 9140/ 24424 | consumed samples: 4679680 | consumed tokens: 9583984640 | elapsed time per iteration (s): 2.24 | learning rate: 1.463E-04 | global batch size: 512 | lm loss: 2.165430E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.787 | TFLOPs: 23.55 | 63: iteration 9150/ 24424 | consumed samples: 4684800 | consumed tokens: 9594470400 | elapsed time per iteration (s): 2.25 | learning rate: 1.462E-04 | global batch size: 512 | lm loss: 2.173766E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.861 | TFLOPs: 23.46 | 63: iteration 9160/ 24424 | consumed samples: 4689920 | consumed tokens: 9604956160 | elapsed time per iteration (s): 2.24 | learning rate: 1.461E-04 | global batch size: 512 | lm loss: 2.169388E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.025 | TFLOPs: 23.58 | 63: iteration 9170/ 24424 | consumed samples: 4695040 | consumed tokens: 9615441920 | elapsed time per iteration (s): 2.27 | learning rate: 1.460E-04 | global batch size: 512 | lm loss: 2.167441E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.737 | TFLOPs: 23.24 | 63: iteration 9180/ 24424 | consumed samples: 4700160 | consumed tokens: 9625927680 | elapsed time per iteration (s): 2.23 | learning rate: 1.459E-04 | global batch size: 512 | lm loss: 2.155265E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.105 | TFLOPs: 23.59 | 63: iteration 9190/ 24424 | consumed samples: 4705280 | consumed tokens: 9636413440 | elapsed time per iteration (s): 2.24 | learning rate: 1.458E-04 | global batch size: 512 | lm loss: 2.181724E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.606 | TFLOPs: 23.53 | 63: iteration 9200/ 24424 | consumed samples: 4710400 | consumed tokens: 9646899200 | elapsed time per iteration (s): 2.54 | learning rate: 1.456E-04 | global batch size: 512 | lm loss: 2.170252E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 201.364 | TFLOPs: 20.73 | 63: iteration 9210/ 24424 | consumed samples: 4715520 | consumed tokens: 9657384960 | elapsed time per iteration (s): 2.23 | learning rate: 1.455E-04 | global batch size: 512 | lm loss: 2.146734E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.633 | TFLOPs: 23.64 | 63: iteration 9220/ 24424 | consumed samples: 4720640 | consumed tokens: 9667870720 | elapsed time per iteration (s): 2.23 | learning rate: 1.454E-04 | global batch size: 512 | lm loss: 2.178718E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.672 | TFLOPs: 23.64 | 63: iteration 9230/ 24424 | consumed samples: 4725760 | consumed tokens: 9678356480 | elapsed time per iteration (s): 2.28 | learning rate: 1.453E-04 | global batch size: 512 | lm loss: 2.138638E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.030 | TFLOPs: 23.17 | 63: iteration 9240/ 24424 | consumed samples: 4730880 | consumed tokens: 9688842240 | elapsed time per iteration (s): 2.25 | learning rate: 1.452E-04 | global batch size: 512 | lm loss: 2.145752E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.466 | TFLOPs: 23.42 | 63: iteration 9250/ 24424 | consumed samples: 4736000 | consumed tokens: 9699328000 | elapsed time per iteration (s): 2.23 | learning rate: 1.451E-04 | global batch size: 512 | lm loss: 2.161603E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.608 | TFLOPs: 23.64 | 63: iteration 9260/ 24424 | consumed samples: 4741120 | consumed tokens: 9709813760 | elapsed time per iteration (s): 2.24 | learning rate: 1.450E-04 | global batch size: 512 | lm loss: 2.142574E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.631 | TFLOPs: 23.54 | 63: iteration 9270/ 24424 | consumed samples: 4746240 | consumed tokens: 9720299520 | elapsed time per iteration (s): 2.26 | learning rate: 1.449E-04 | global batch size: 512 | lm loss: 2.165634E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.914 | TFLOPs: 23.36 | 63: iteration 9280/ 24424 | consumed samples: 4751360 | consumed tokens: 9730785280 | elapsed time per iteration (s): 2.25 | learning rate: 1.448E-04 | global batch size: 512 | lm loss: 2.172744E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.940 | TFLOPs: 23.47 | 63: iteration 9290/ 24424 | consumed samples: 4756480 | consumed tokens: 9741271040 | elapsed time per iteration (s): 2.25 | learning rate: 1.447E-04 | global batch size: 512 | lm loss: 2.151206E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.685 | TFLOPs: 23.44 | 63: iteration 9300/ 24424 | consumed samples: 4761600 | consumed tokens: 9751756800 | elapsed time per iteration (s): 2.24 | learning rate: 1.446E-04 | global batch size: 512 | lm loss: 2.149312E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.546 | TFLOPs: 23.53 | 63: iteration 9310/ 24424 | consumed samples: 4766720 | consumed tokens: 9762242560 | elapsed time per iteration (s): 2.26 | learning rate: 1.445E-04 | global batch size: 512 | lm loss: 2.162302E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.560 | TFLOPs: 23.32 | 63: iteration 9320/ 24424 | consumed samples: 4771840 | consumed tokens: 9772728320 | elapsed time per iteration (s): 2.25 | learning rate: 1.444E-04 | global batch size: 512 | lm loss: 2.163214E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.178 | TFLOPs: 23.39 | 63: iteration 9330/ 24424 | consumed samples: 4776960 | consumed tokens: 9783214080 | elapsed time per iteration (s): 2.25 | learning rate: 1.442E-04 | global batch size: 512 | lm loss: 2.168065E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.807 | TFLOPs: 23.45 | 63: iteration 9340/ 24424 | consumed samples: 4782080 | consumed tokens: 9793699840 | elapsed time per iteration (s): 2.23 | learning rate: 1.441E-04 | global batch size: 512 | lm loss: 2.167594E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.104 | TFLOPs: 23.59 | 63: iteration 9350/ 24424 | consumed samples: 4787200 | consumed tokens: 9804185600 | elapsed time per iteration (s): 2.31 | learning rate: 1.440E-04 | global batch size: 512 | lm loss: 2.157764E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.783 | TFLOPs: 22.83 | 63: iteration 9360/ 24424 | consumed samples: 4792320 | consumed tokens: 9814671360 | elapsed time per iteration (s): 2.26 | learning rate: 1.439E-04 | global batch size: 512 | lm loss: 2.164140E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.939 | TFLOPs: 23.36 | 63: iteration 9370/ 24424 | consumed samples: 4797440 | consumed tokens: 9825157120 | elapsed time per iteration (s): 2.23 | learning rate: 1.438E-04 | global batch size: 512 | lm loss: 2.158094E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.729 | TFLOPs: 23.65 | 63: iteration 9380/ 24424 | consumed samples: 4802560 | consumed tokens: 9835642880 | elapsed time per iteration (s): 2.25 | learning rate: 1.437E-04 | global batch size: 512 | lm loss: 2.159772E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.860 | TFLOPs: 23.46 | 63: iteration 9390/ 24424 | consumed samples: 4807680 | consumed tokens: 9846128640 | elapsed time per iteration (s): 2.23 | learning rate: 1.436E-04 | global batch size: 512 | lm loss: 2.145806E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.277 | TFLOPs: 23.60 | 63: iteration 9400/ 24424 | consumed samples: 4812800 | consumed tokens: 9856614400 | elapsed time per iteration (s): 2.24 | learning rate: 1.435E-04 | global batch size: 512 | lm loss: 2.156976E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.895 | TFLOPs: 23.56 | 63: iteration 9410/ 24424 | consumed samples: 4817920 | consumed tokens: 9867100160 | elapsed time per iteration (s): 2.26 | learning rate: 1.434E-04 | global batch size: 512 | lm loss: 2.144506E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.913 | TFLOPs: 23.36 | 63: iteration 9420/ 24424 | consumed samples: 4823040 | consumed tokens: 9877585920 | elapsed time per iteration (s): 2.24 | learning rate: 1.433E-04 | global batch size: 512 | lm loss: 2.139914E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.078 | TFLOPs: 23.48 | 63: iteration 9430/ 24424 | consumed samples: 4828160 | consumed tokens: 9888071680 | elapsed time per iteration (s): 2.25 | learning rate: 1.432E-04 | global batch size: 512 | lm loss: 2.148978E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.119 | TFLOPs: 23.38 | 63: iteration 9440/ 24424 | consumed samples: 4833280 | consumed tokens: 9898557440 | elapsed time per iteration (s): 2.24 | learning rate: 1.431E-04 | global batch size: 512 | lm loss: 2.154644E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.047 | TFLOPs: 23.58 | 63: iteration 9450/ 24424 | consumed samples: 4838400 | consumed tokens: 9909043200 | elapsed time per iteration (s): 2.23 | learning rate: 1.429E-04 | global batch size: 512 | lm loss: 2.157435E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.257 | TFLOPs: 23.60 | 63: iteration 9460/ 24424 | consumed samples: 4843520 | consumed tokens: 9919528960 | elapsed time per iteration (s): 2.23 | learning rate: 1.428E-04 | global batch size: 512 | lm loss: 2.145089E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.683 | TFLOPs: 23.64 | 63: iteration 9470/ 24424 | consumed samples: 4848640 | consumed tokens: 9930014720 | elapsed time per iteration (s): 2.24 | learning rate: 1.427E-04 | global batch size: 512 | lm loss: 2.169381E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.088 | TFLOPs: 23.48 | 63: iteration 9480/ 24424 | consumed samples: 4853760 | consumed tokens: 9940500480 | elapsed time per iteration (s): 2.23 | learning rate: 1.426E-04 | global batch size: 512 | lm loss: 2.135665E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.297 | TFLOPs: 23.61 | 63: iteration 9490/ 24424 | consumed samples: 4858880 | consumed tokens: 9950986240 | elapsed time per iteration (s): 2.23 | learning rate: 1.425E-04 | global batch size: 512 | lm loss: 2.170382E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.220 | TFLOPs: 23.60 | 63: iteration 9500/ 24424 | consumed samples: 4864000 | consumed tokens: 9961472000 | elapsed time per iteration (s): 2.24 | learning rate: 1.424E-04 | global batch size: 512 | lm loss: 2.166728E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.498 | TFLOPs: 23.52 | 63: iteration 9510/ 24424 | consumed samples: 4869120 | consumed tokens: 9971957760 | elapsed time per iteration (s): 2.24 | learning rate: 1.423E-04 | global batch size: 512 | lm loss: 2.171986E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.257 | TFLOPs: 23.50 | 63: iteration 9520/ 24424 | consumed samples: 4874240 | consumed tokens: 9982443520 | elapsed time per iteration (s): 2.28 | learning rate: 1.422E-04 | global batch size: 512 | lm loss: 2.154097E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.556 | TFLOPs: 23.12 | 63: iteration 9530/ 24424 | consumed samples: 4879360 | consumed tokens: 9992929280 | elapsed time per iteration (s): 2.23 | learning rate: 1.421E-04 | global batch size: 512 | lm loss: 2.141597E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.281 | TFLOPs: 23.60 | 63: iteration 9540/ 24424 | consumed samples: 4884480 | consumed tokens: 10003415040 | elapsed time per iteration (s): 2.87 | learning rate: 1.420E-04 | global batch size: 512 | lm loss: 2.144971E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.329 | TFLOPs: 18.36 | 63: iteration 9550/ 24424 | consumed samples: 4889600 | consumed tokens: 10013900800 | elapsed time per iteration (s): 2.24 | learning rate: 1.419E-04 | global batch size: 512 | lm loss: 2.178647E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.396 | TFLOPs: 23.51 | 63: iteration 9560/ 24424 | consumed samples: 4894720 | consumed tokens: 10024386560 | elapsed time per iteration (s): 2.23 | learning rate: 1.417E-04 | global batch size: 512 | lm loss: 2.164188E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.229 | TFLOPs: 23.60 | 63: iteration 9570/ 24424 | consumed samples: 4899840 | consumed tokens: 10034872320 | elapsed time per iteration (s): 2.26 | learning rate: 1.416E-04 | global batch size: 512 | lm loss: 2.139429E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.492 | TFLOPs: 23.32 | 63: iteration 9580/ 24424 | consumed samples: 4904960 | consumed tokens: 10045358080 | elapsed time per iteration (s): 2.23 | learning rate: 1.415E-04 | global batch size: 512 | lm loss: 2.149752E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.144 | TFLOPs: 23.59 | 63: iteration 9590/ 24424 | consumed samples: 4910080 | consumed tokens: 10055843840 | elapsed time per iteration (s): 2.25 | learning rate: 1.414E-04 | global batch size: 512 | lm loss: 2.147614E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.497 | TFLOPs: 23.42 | 63: iteration 9600/ 24424 | consumed samples: 4915200 | consumed tokens: 10066329600 | elapsed time per iteration (s): 2.23 | learning rate: 1.413E-04 | global batch size: 512 | lm loss: 2.156587E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.090 | TFLOPs: 23.69 | 63: iteration 9610/ 24424 | consumed samples: 4920320 | consumed tokens: 10076815360 | elapsed time per iteration (s): 2.23 | learning rate: 1.412E-04 | global batch size: 512 | lm loss: 2.149124E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.733 | TFLOPs: 23.65 | 63: iteration 9620/ 24424 | consumed samples: 4925440 | consumed tokens: 10087301120 | elapsed time per iteration (s): 2.27 | learning rate: 1.411E-04 | global batch size: 512 | lm loss: 2.154089E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.748 | TFLOPs: 23.24 | 63: iteration 9630/ 24424 | consumed samples: 4930560 | consumed tokens: 10097786880 | elapsed time per iteration (s): 2.29 | learning rate: 1.410E-04 | global batch size: 512 | lm loss: 2.161029E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.453 | TFLOPs: 23.00 | 63: iteration 9640/ 24424 | consumed samples: 4935680 | consumed tokens: 10108272640 | elapsed time per iteration (s): 2.24 | learning rate: 1.409E-04 | global batch size: 512 | lm loss: 2.133604E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.063 | TFLOPs: 23.58 | 63: iteration 9650/ 24424 | consumed samples: 4940800 | consumed tokens: 10118758400 | elapsed time per iteration (s): 2.25 | learning rate: 1.408E-04 | global batch size: 512 | lm loss: 2.151845E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.726 | TFLOPs: 23.44 | 63: iteration 9660/ 24424 | consumed samples: 4945920 | consumed tokens: 10129244160 | elapsed time per iteration (s): 2.23 | learning rate: 1.406E-04 | global batch size: 512 | lm loss: 2.134304E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.847 | TFLOPs: 23.66 | 63: iteration 9670/ 24424 | consumed samples: 4951040 | consumed tokens: 10139729920 | elapsed time per iteration (s): 2.23 | learning rate: 1.405E-04 | global batch size: 512 | lm loss: 2.138516E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.511 | TFLOPs: 23.63 | 63: iteration 9680/ 24424 | consumed samples: 4956160 | consumed tokens: 10150215680 | elapsed time per iteration (s): 2.23 | learning rate: 1.404E-04 | global batch size: 512 | lm loss: 2.139870E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.656 | TFLOPs: 23.64 | 63: iteration 9690/ 24424 | consumed samples: 4961280 | consumed tokens: 10160701440 | elapsed time per iteration (s): 2.27 | learning rate: 1.403E-04 | global batch size: 512 | lm loss: 2.132760E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.909 | TFLOPs: 23.26 | 63: iteration 9700/ 24424 | consumed samples: 4966400 | consumed tokens: 10171187200 | elapsed time per iteration (s): 2.53 | learning rate: 1.402E-04 | global batch size: 512 | lm loss: 2.149873E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 202.611 | TFLOPs: 20.86 | 63: iteration 9710/ 24424 | consumed samples: 4971520 | consumed tokens: 10181672960 | elapsed time per iteration (s): 2.27 | learning rate: 1.401E-04 | global batch size: 512 | lm loss: 2.154015E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.530 | TFLOPs: 23.22 | 63: iteration 9720/ 24424 | consumed samples: 4976640 | consumed tokens: 10192158720 | elapsed time per iteration (s): 2.24 | learning rate: 1.400E-04 | global batch size: 512 | lm loss: 2.158551E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.479 | TFLOPs: 23.52 | 63: iteration 9730/ 24424 | consumed samples: 4981760 | consumed tokens: 10202644480 | elapsed time per iteration (s): 2.23 | learning rate: 1.399E-04 | global batch size: 512 | lm loss: 2.151786E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.518 | TFLOPs: 23.63 | 63: iteration 9740/ 24424 | consumed samples: 4986880 | consumed tokens: 10213130240 | elapsed time per iteration (s): 2.24 | learning rate: 1.398E-04 | global batch size: 512 | lm loss: 2.127866E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.608 | TFLOPs: 23.53 | 63: iteration 9750/ 24424 | consumed samples: 4992000 | consumed tokens: 10223616000 | elapsed time per iteration (s): 2.23 | learning rate: 1.397E-04 | global batch size: 512 | lm loss: 2.141270E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.434 | TFLOPs: 23.62 | 63: iteration 9760/ 24424 | consumed samples: 4997120 | consumed tokens: 10234101760 | elapsed time per iteration (s): 2.25 | learning rate: 1.395E-04 | global batch size: 512 | lm loss: 2.157483E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.763 | TFLOPs: 23.45 | 63: iteration 9770/ 24424 | consumed samples: 5002240 | consumed tokens: 10244587520 | elapsed time per iteration (s): 2.25 | learning rate: 1.394E-04 | global batch size: 512 | lm loss: 2.139104E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.504 | TFLOPs: 23.42 | 63: iteration 9780/ 24424 | consumed samples: 5007360 | consumed tokens: 10255073280 | elapsed time per iteration (s): 5.59 | learning rate: 1.393E-04 | global batch size: 512 | lm loss: 2.155025E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 91.609 | TFLOPs: 9.43 | 63: iteration 9790/ 24424 | consumed samples: 5012480 | consumed tokens: 10265559040 | elapsed time per iteration (s): 2.23 | learning rate: 1.392E-04 | global batch size: 512 | lm loss: 2.138725E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.307 | TFLOPs: 23.61 | 63: iteration 9800/ 24424 | consumed samples: 5017600 | consumed tokens: 10276044800 | elapsed time per iteration (s): 2.25 | learning rate: 1.391E-04 | global batch size: 512 | lm loss: 2.121526E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.015 | TFLOPs: 23.47 | 63: iteration 9810/ 24424 | consumed samples: 5022720 | consumed tokens: 10286530560 | elapsed time per iteration (s): 2.23 | learning rate: 1.390E-04 | global batch size: 512 | lm loss: 2.150859E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.172 | TFLOPs: 23.59 | 63: iteration 9820/ 24424 | consumed samples: 5027840 | consumed tokens: 10297016320 | elapsed time per iteration (s): 2.29 | learning rate: 1.389E-04 | global batch size: 512 | lm loss: 2.159329E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.236 | TFLOPs: 22.98 | 63: iteration 9830/ 24424 | consumed samples: 5032960 | consumed tokens: 10307502080 | elapsed time per iteration (s): 2.24 | learning rate: 1.388E-04 | global batch size: 512 | lm loss: 2.130918E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.273 | TFLOPs: 23.50 | 63: iteration 9840/ 24424 | consumed samples: 5038080 | consumed tokens: 10317987840 | elapsed time per iteration (s): 2.24 | learning rate: 1.387E-04 | global batch size: 512 | lm loss: 2.127263E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.286 | TFLOPs: 23.50 | 63: iteration 9850/ 24424 | consumed samples: 5043200 | consumed tokens: 10328473600 | elapsed time per iteration (s): 2.45 | learning rate: 1.385E-04 | global batch size: 512 | lm loss: 2.132923E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 209.065 | TFLOPs: 21.52 | 63: iteration 9860/ 24424 | consumed samples: 5048320 | consumed tokens: 10338959360 | elapsed time per iteration (s): 2.23 | learning rate: 1.384E-04 | global batch size: 512 | lm loss: 2.125175E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.510 | TFLOPs: 23.63 | 63: iteration 9870/ 24424 | consumed samples: 5053440 | consumed tokens: 10349445120 | elapsed time per iteration (s): 2.24 | learning rate: 1.383E-04 | global batch size: 512 | lm loss: 2.150908E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.984 | TFLOPs: 23.57 | 63: iteration 9880/ 24424 | consumed samples: 5058560 | consumed tokens: 10359930880 | elapsed time per iteration (s): 2.23 | learning rate: 1.382E-04 | global batch size: 512 | lm loss: 2.138596E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.990 | TFLOPs: 23.68 | 63: iteration 9890/ 24424 | consumed samples: 5063680 | consumed tokens: 10370416640 | elapsed time per iteration (s): 2.26 | learning rate: 1.381E-04 | global batch size: 512 | lm loss: 2.146095E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.076 | TFLOPs: 23.27 | 63: iteration 9900/ 24424 | consumed samples: 5068800 | consumed tokens: 10380902400 | elapsed time per iteration (s): 2.23 | learning rate: 1.380E-04 | global batch size: 512 | lm loss: 2.128313E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.380 | TFLOPs: 23.61 | 63: iteration 9910/ 24424 | consumed samples: 5073920 | consumed tokens: 10391388160 | elapsed time per iteration (s): 2.23 | learning rate: 1.379E-04 | global batch size: 512 | lm loss: 2.144780E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.459 | TFLOPs: 23.62 | 63: iteration 9920/ 24424 | consumed samples: 5079040 | consumed tokens: 10401873920 | elapsed time per iteration (s): 2.24 | learning rate: 1.378E-04 | global batch size: 512 | lm loss: 2.118746E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.372 | TFLOPs: 23.51 | 63: iteration 9930/ 24424 | consumed samples: 5084160 | consumed tokens: 10412359680 | elapsed time per iteration (s): 2.25 | learning rate: 1.377E-04 | global batch size: 512 | lm loss: 2.155293E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.949 | TFLOPs: 23.47 | 63: iteration 9940/ 24424 | consumed samples: 5089280 | consumed tokens: 10422845440 | elapsed time per iteration (s): 2.25 | learning rate: 1.375E-04 | global batch size: 512 | lm loss: 2.148979E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.756 | TFLOPs: 23.45 | 63: iteration 9950/ 24424 | consumed samples: 5094400 | consumed tokens: 10433331200 | elapsed time per iteration (s): 2.27 | learning rate: 1.374E-04 | global batch size: 512 | lm loss: 2.139668E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.039 | TFLOPs: 23.27 | 63: iteration 9960/ 24424 | consumed samples: 5099520 | consumed tokens: 10443816960 | elapsed time per iteration (s): 2.28 | learning rate: 1.373E-04 | global batch size: 512 | lm loss: 2.160516E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.967 | TFLOPs: 23.16 | 63: iteration 9970/ 24424 | consumed samples: 5104640 | consumed tokens: 10454302720 | elapsed time per iteration (s): 2.25 | learning rate: 1.372E-04 | global batch size: 512 | lm loss: 2.134497E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.441 | TFLOPs: 23.41 | 63: iteration 9980/ 24424 | consumed samples: 5109760 | consumed tokens: 10464788480 | elapsed time per iteration (s): 2.24 | learning rate: 1.371E-04 | global batch size: 512 | lm loss: 2.115838E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.220 | TFLOPs: 23.49 | 63: iteration 9990/ 24424 | consumed samples: 5114880 | consumed tokens: 10475274240 | elapsed time per iteration (s): 2.23 | learning rate: 1.370E-04 | global batch size: 512 | lm loss: 2.154679E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.503 | TFLOPs: 23.63 | 0: [2022-11-26 00:08:21,846] [INFO] [logging.py:68:log_dist] [Rank 0] step=10000, skipped=0, lr=[0.00013688086445850332, 0.00013688086445850332, 0.00013688086445850332], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] 63: iteration 10000/ 24424 | consumed samples: 5120000 | consumed tokens: 10485760000 | elapsed time per iteration (s): 2.23 | learning rate: 1.369E-04 | global batch size: 512 | lm loss: 2.159416E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.827 | TFLOPs: 23.66 | 0: steps: 10000 loss: 2.1400 iter time (s): 2.449 samples/sec: 209.044 63: ------------------------------------------------------------------------------------------- 63: valid loss at iteration 10000 | lm loss value: 2.078765E+00 | lm loss PPL: 7.994593E+00 | 63: ------------------------------------------------------------------------------------------- 0: saving checkpoint at iteration 10000 to checkpoints_3b9 0: [2022-11-26 00:08:22,594] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step10000 is begin to save! 0: [2022-11-26 00:08:22,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_01-model_00-model_states.pt... 32: [2022-11-26 00:08:22,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_21-model_00-model_states.pt... 32: [2022-11-26 00:08:22,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_21-model_00-model_states.pt. 32: [2022-11-26 00:08:22,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_22-model_00-model_states.pt... 0: [2022-11-26 00:08:23,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_01-model_00-model_states.pt. 0: [2022-11-26 00:08:23,017] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_03-model_00-model_states.pt... 32: [2022-11-26 00:08:23,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_22-model_00-model_states.pt. 32: [2022-11-26 00:08:23,080] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_23-model_00-model_states.pt... 0: [2022-11-26 00:08:23,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_03-model_00-model_states.pt. 0: [2022-11-26 00:08:23,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_04-model_00-model_states.pt... 32: [2022-11-26 00:08:23,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_23-model_00-model_states.pt. 32: [2022-11-26 00:08:23,311] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_24-model_00-model_states.pt... 0: [2022-11-26 00:08:23,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_04-model_00-model_states.pt. 0: [2022-11-26 00:08:23,491] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_05-model_00-model_states.pt... 32: [2022-11-26 00:08:23,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_24-model_00-model_states.pt. 32: [2022-11-26 00:08:23,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_25-model_00-model_states.pt... 0: [2022-11-26 00:08:23,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_05-model_00-model_states.pt. 0: [2022-11-26 00:08:23,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_06-model_00-model_states.pt... 32: [2022-11-26 00:08:23,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_25-model_00-model_states.pt. 32: [2022-11-26 00:08:23,768] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_26-model_00-model_states.pt... 0: [2022-11-26 00:08:23,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_06-model_00-model_states.pt. 0: [2022-11-26 00:08:23,961] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_07-model_00-model_states.pt... 32: [2022-11-26 00:08:23,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_26-model_00-model_states.pt. 32: [2022-11-26 00:08:23,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_27-model_00-model_states.pt... 0: [2022-11-26 00:08:24,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_07-model_00-model_states.pt. 0: [2022-11-26 00:08:24,197] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_08-model_00-model_states.pt... 32: [2022-11-26 00:08:24,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_27-model_00-model_states.pt. 32: [2022-11-26 00:08:24,223] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_28-model_00-model_states.pt... 0: [2022-11-26 00:08:24,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_08-model_00-model_states.pt. 0: [2022-11-26 00:08:24,428] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_09-model_00-model_states.pt... 32: [2022-11-26 00:08:24,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_28-model_00-model_states.pt. 32: [2022-11-26 00:08:24,445] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_29-model_00-model_states.pt... 0: [2022-11-26 00:08:24,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_09-model_00-model_states.pt. 0: [2022-11-26 00:08:24,658] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_10-model_00-model_states.pt... 32: [2022-11-26 00:08:24,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_29-model_00-model_states.pt. 32: [2022-11-26 00:08:24,676] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_30-model_00-model_states.pt... 0: [2022-11-26 00:08:24,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_10-model_00-model_states.pt. 0: [2022-11-26 00:08:24,891] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_11-model_00-model_states.pt... 32: [2022-11-26 00:08:24,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_30-model_00-model_states.pt. 32: [2022-11-26 00:08:24,895] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_31-model_00-model_states.pt... 32: [2022-11-26 00:08:25,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_31-model_00-model_states.pt. 32: [2022-11-26 00:08:25,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_32-model_00-model_states.pt... 0: [2022-11-26 00:08:25,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_11-model_00-model_states.pt. 0: [2022-11-26 00:08:25,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_12-model_00-model_states.pt... 32: [2022-11-26 00:08:25,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_32-model_00-model_states.pt. 32: [2022-11-26 00:08:25,340] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_33-model_00-model_states.pt... 0: [2022-11-26 00:08:25,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_12-model_00-model_states.pt. 0: [2022-11-26 00:08:25,350] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_13-model_00-model_states.pt... 32: [2022-11-26 00:08:25,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_33-model_00-model_states.pt. 32: [2022-11-26 00:08:25,566] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_34-model_00-model_states.pt... 0: [2022-11-26 00:08:25,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_13-model_00-model_states.pt. 0: [2022-11-26 00:08:25,577] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_14-model_00-model_states.pt... 0: [2022-11-26 00:08:25,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_14-model_00-model_states.pt. 0: [2022-11-26 00:08:25,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_15-model_00-model_states.pt... 32: [2022-11-26 00:08:25,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_34-model_00-model_states.pt. 32: [2022-11-26 00:08:25,914] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_35-model_00-model_states.pt... 0: [2022-11-26 00:08:26,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_15-model_00-model_states.pt. 0: [2022-11-26 00:08:26,041] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_16-model_00-model_states.pt... 32: [2022-11-26 00:08:26,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_35-model_00-model_states.pt. 32: [2022-11-26 00:08:26,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_36-model_00-model_states.pt... 0: [2022-11-26 00:08:26,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_16-model_00-model_states.pt. 0: [2022-11-26 00:08:26,273] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_17-model_00-model_states.pt... 32: [2022-11-26 00:08:26,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_36-model_00-model_states.pt. 32: [2022-11-26 00:08:26,375] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_37-model_00-model_states.pt... 0: [2022-11-26 00:08:26,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_17-model_00-model_states.pt. 0: [2022-11-26 00:08:26,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_18-model_00-model_states.pt... 32: [2022-11-26 00:08:26,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_37-model_00-model_states.pt. 32: [2022-11-26 00:08:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_38-model_00-model_states.pt... 0: [2022-11-26 00:08:26,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_18-model_00-model_states.pt. 0: [2022-11-26 00:08:26,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_19-model_00-model_states.pt... 32: [2022-11-26 00:08:26,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_38-model_00-model_states.pt. 32: [2022-11-26 00:08:26,819] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_40-model_00-model_states.pt... 32: [2022-11-26 00:08:26,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_40-model_00-model_states.pt. 32: [2022-11-26 00:08:26,823] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/mp_rank_01_model_states.pt... 32: [2022-11-26 00:08:26,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/mp_rank_01_model_states.pt. 0: [2022-11-26 00:08:26,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_19-model_00-model_states.pt. 0: [2022-11-26 00:08:26,961] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/layer_20-model_00-model_states.pt... 0: [2022-11-26 00:08:27,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/layer_20-model_00-model_states.pt. 0: [2022-11-26 00:08:27,187] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step10000/mp_rank_00_model_states.pt 0: [2022-11-26 00:08:27,187] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/mp_rank_00_model_states.pt... 0: [2022-11-26 00:08:27,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/mp_rank_00_model_states.pt. 51: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 51: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 55: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 55: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 55: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 53: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 57: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 57: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 61: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 59: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 59: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 63: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 52: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 52: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 52: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 54: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 54: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 54: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 56: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 56: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 56: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 58: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 58: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 62: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 62: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 60: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 60: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 60: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 0: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 33: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 33: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 33: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 33: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 35: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 35: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 34: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 34: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 40: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 40: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 40: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 44: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 44: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 44: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 32: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 42: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 42: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 42: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 46: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 46: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 46: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 46: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 36: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 36: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 48: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 48: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 41: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 41: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 41: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 41: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 37: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 37: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 37: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 37: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 47: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 47: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 47: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 47: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 49: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 43: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 43: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 45: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 45: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 39: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 39: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 50: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 50: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 38: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 38: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 51: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 51: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 55: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 53: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 53: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 57: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 57: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 61: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 61: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 59: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 63: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 63: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 63: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 19: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 19: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 19: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 21: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 21: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 15: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 15: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 15: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 15: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 15: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 52: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 54: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 56: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 58: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 62: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 62: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 62: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 16: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 16: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 16: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 60: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 60: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 2: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 2: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 2: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 2: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 4: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 4: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 14: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 14: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 14: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 14: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 14: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 14: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 18: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 18: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 18: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 18: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 18: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 10: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 10: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 10: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 10: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 8: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 8: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 8: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 0: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 26: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 26: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 26: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 26: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 26: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 11: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 11: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 11: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 17: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 9: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 9: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 9: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 13: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 13: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 13: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 23: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 23: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 23: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 23: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 25: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 25: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 25: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 25: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 27: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 27: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 27: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 27: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 29: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 29: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 29: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 29: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 29: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 29: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 31: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 31: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 31: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 31: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 31: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 5: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 5: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 5: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 33: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 33: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 1: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 1: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 1: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 1: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 1: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 1: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 7: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 7: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 3: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 3: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 35: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 22: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 22: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 12: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 12: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 12: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 12: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 12: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 12: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 30: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 30: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 30: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 24: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 24: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 24: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 24: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 28: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 28: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 28: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 28: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 28: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 20: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 20: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 20: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 34: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 40: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 44: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 44: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 6: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 6: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 6: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 6: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 32: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 32: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 32: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 42: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 42: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 46: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 36: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 36: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 48: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 41: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 37: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 47: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 49: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 43: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 45: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 45: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 39: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 50: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 50: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 38: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 38: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 59: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 19: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 19: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 19: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 21: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 21: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 21: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 21: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 21: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 21: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 54: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 58: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 16: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 16: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 2: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 4: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 4: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 4: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 4: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 4: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 14: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 14: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 18: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 18: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 10: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 8: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 8: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 8: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 8: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 8: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 0: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 26: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 11: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 11: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 11: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 17: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 17: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 17: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 17: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 9: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 9: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 13: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 13: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 13: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 13: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 23: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 23: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 25: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 25: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 27: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 27: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 27: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 29: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 29: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 31: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 31: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 31: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 5: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 1: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 1: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 7: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 3: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 3: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 3: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 35: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 22: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 22: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 22: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 22: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 12: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 30: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 30: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 30: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 30: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 24: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 24: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 24: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 24: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 28: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 20: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 20: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 34: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 6: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 6: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 32: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 46: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 48: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 41: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 47: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 43: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 39: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 19: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 15: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 16: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 2: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 4: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 18: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 10: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 0: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 0: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 11: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 17: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 17: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 9: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 23: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 23: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 25: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 27: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 5: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 5: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 5: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 7: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 7: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 7: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 7: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 3: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 22: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 12: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 30: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 28: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 20: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 6: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 6: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 32: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 32: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 19: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 15: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 16: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 16: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 2: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 10: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 0: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 9: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 5: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 3: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 3: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 22: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 20: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 32: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 2: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 10: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 0: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 9: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 0: [2022-11-26 00:08:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 0: [2022-11-26 00:08:27,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:08:27,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:08:27,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-26 00:08:27,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 43: [2022-11-26 00:08:27,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-26 00:08:27,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-26 00:08:27,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 11: [2022-11-26 00:08:27,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 11: [2022-11-26 00:08:27,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 11: [2022-11-26 00:08:27,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 42: [2022-11-26 00:08:27,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 42: [2022-11-26 00:08:27,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-26 00:08:27,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 32: [2022-11-26 00:08:27,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-26 00:08:27,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-26 00:08:27,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 13: [2022-11-26 00:08:27,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 13: [2022-11-26 00:08:27,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-26 00:08:27,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 54: [2022-11-26 00:08:27,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 12: [2022-11-26 00:08:27,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 12: [2022-11-26 00:08:27,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-26 00:08:27,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 31: [2022-11-26 00:08:27,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 54: [2022-11-26 00:08:27,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 31: [2022-11-26 00:08:27,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 54: [2022-11-26 00:08:27,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 2: [2022-11-26 00:08:27,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 31: [2022-11-26 00:08:27,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 2: [2022-11-26 00:08:27,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-26 00:08:27,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 41: [2022-11-26 00:08:27,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-26 00:08:27,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 41: [2022-11-26 00:08:27,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 59: [2022-11-26 00:08:27,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 6: [2022-11-26 00:08:27,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 59: [2022-11-26 00:08:27,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 6: [2022-11-26 00:08:27,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 61: [2022-11-26 00:08:27,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 59: [2022-11-26 00:08:27,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 6: [2022-11-26 00:08:27,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 61: [2022-11-26 00:08:27,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-26 00:08:27,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 52: [2022-11-26 00:08:27,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 45: [2022-11-26 00:08:27,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 52: [2022-11-26 00:08:27,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 45: [2022-11-26 00:08:27,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 52: [2022-11-26 00:08:27,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 45: [2022-11-26 00:08:27,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 8: [2022-11-26 00:08:27,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-26 00:08:27,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-26 00:08:27,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 22: [2022-11-26 00:08:27,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-26 00:08:27,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-26 00:08:27,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 28: [2022-11-26 00:08:27,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-26 00:08:27,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 1: [2022-11-26 00:08:27,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 28: [2022-11-26 00:08:27,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 1: [2022-11-26 00:08:27,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-26 00:08:27,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 5: [2022-11-26 00:08:27,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-26 00:08:27,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-26 00:08:27,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 34: [2022-11-26 00:08:27,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 9: [2022-11-26 00:08:27,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-26 00:08:27,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-26 00:08:27,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 34: [2022-11-26 00:08:27,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-26 00:08:27,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 48: [2022-11-26 00:08:27,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-26 00:08:27,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-26 00:08:27,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 50: [2022-11-26 00:08:27,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 50: [2022-11-26 00:08:27,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-26 00:08:27,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 11: [2022-11-26 00:08:27,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-26 00:08:27,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-26 00:08:27,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 4: [2022-11-26 00:08:27,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-26 00:08:27,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-26 00:08:27,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 13: [2022-11-26 00:08:27,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-26 00:08:27,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-26 00:08:27,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 57: [2022-11-26 00:08:27,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 24: [2022-11-26 00:08:27,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 57: [2022-11-26 00:08:27,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 18: [2022-11-26 00:08:27,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 24: [2022-11-26 00:08:27,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 41: [2022-11-26 00:08:27,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 57: [2022-11-26 00:08:27,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 18: [2022-11-26 00:08:27,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 24: [2022-11-26 00:08:27,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 41: [2022-11-26 00:08:27,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 18: [2022-11-26 00:08:27,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 41: [2022-11-26 00:08:27,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 62: [2022-11-26 00:08:27,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 43: [2022-11-26 00:08:27,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 62: [2022-11-26 00:08:27,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-26 00:08:27,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 43: [2022-11-26 00:08:27,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 43: [2022-11-26 00:08:27,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 4: [2022-11-26 00:08:27,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 57: [2022-11-26 00:08:27,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 4: [2022-11-26 00:08:27,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-26 00:08:27,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 57: [2022-11-26 00:08:27,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 49: [2022-11-26 00:08:27,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-26 00:08:27,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-26 00:08:27,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 57: [2022-11-26 00:08:27,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 16: [2022-11-26 00:08:27,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-26 00:08:27,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-26 00:08:27,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 0: [2022-11-26 00:08:27,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-26 00:08:27,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-26 00:08:27,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 45: [2022-11-26 00:08:27,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 45: [2022-11-26 00:08:27,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-26 00:08:27,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 16: [2022-11-26 00:08:27,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-26 00:08:27,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 60: [2022-11-26 00:08:27,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 16: [2022-11-26 00:08:27,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 25: [2022-11-26 00:08:27,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 25: [2022-11-26 00:08:27,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-26 00:08:27,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 60: [2022-11-26 00:08:27,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-26 00:08:27,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 51: [2022-11-26 00:08:27,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 21: [2022-11-26 00:08:27,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 51: [2022-11-26 00:08:27,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 21: [2022-11-26 00:08:27,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 51: [2022-11-26 00:08:27,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 21: [2022-11-26 00:08:27,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 46: [2022-11-26 00:08:27,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-26 00:08:27,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-26 00:08:27,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 12: [2022-11-26 00:08:27,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-26 00:08:27,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-26 00:08:27,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 14: [2022-11-26 00:08:27,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-26 00:08:27,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-26 00:08:27,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 53: [2022-11-26 00:08:27,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-26 00:08:27,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-26 00:08:27,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 7: [2022-11-26 00:08:27,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-26 00:08:27,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-26 00:08:27,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-26 00:08:27,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 7: [2022-11-26 00:08:27,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 7: [2022-11-26 00:08:27,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 34: [2022-11-26 00:08:27,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 34: [2022-11-26 00:08:27,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-26 00:08:27,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 2: [2022-11-26 00:08:27,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-26 00:08:27,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-26 00:08:27,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 63: [2022-11-26 00:08:27,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 63: [2022-11-26 00:08:27,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-26 00:08:27,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 3: [2022-11-26 00:08:27,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 3: [2022-11-26 00:08:27,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 3: [2022-11-26 00:08:27,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 58: [2022-11-26 00:08:27,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 58: [2022-11-26 00:08:27,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 29: [2022-11-26 00:08:27,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-26 00:08:27,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 47: [2022-11-26 00:08:27,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 58: [2022-11-26 00:08:27,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 29: [2022-11-26 00:08:27,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-26 00:08:27,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 47: [2022-11-26 00:08:27,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 29: [2022-11-26 00:08:27,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 29: [2022-11-26 00:08:27,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 47: [2022-11-26 00:08:27,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 5: [2022-11-26 00:08:27,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-26 00:08:27,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-26 00:08:27,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 10: [2022-11-26 00:08:27,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 10: [2022-11-26 00:08:27,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-26 00:08:27,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-26 00:08:27,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-26 00:08:27,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 10: [2022-11-26 00:08:27,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 33: [2022-11-26 00:08:27,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-26 00:08:27,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-26 00:08:27,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 18: [2022-11-26 00:08:27,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-26 00:08:27,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 59: [2022-11-26 00:08:27,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 18: [2022-11-26 00:08:27,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 59: [2022-11-26 00:08:27,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-26 00:08:27,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 27: [2022-11-26 00:08:27,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 6: [2022-11-26 00:08:27,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 27: [2022-11-26 00:08:27,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-26 00:08:27,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 6: [2022-11-26 00:08:27,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-26 00:08:27,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 38: [2022-11-26 00:08:27,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-26 00:08:27,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-26 00:08:27,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 17: [2022-11-26 00:08:27,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 22: [2022-11-26 00:08:27,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 17: [2022-11-26 00:08:27,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-26 00:08:27,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 22: [2022-11-26 00:08:27,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-26 00:08:27,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 1: [2022-11-26 00:08:27,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-26 00:08:27,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-26 00:08:27,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 15: [2022-11-26 00:08:27,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 61: [2022-11-26 00:08:27,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 15: [2022-11-26 00:08:27,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 61: [2022-11-26 00:08:27,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 15: [2022-11-26 00:08:27,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 61: [2022-11-26 00:08:27,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 48: [2022-11-26 00:08:27,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-26 00:08:27,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-26 00:08:27,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 26: [2022-11-26 00:08:27,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-26 00:08:27,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-26 00:08:27,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 32: [2022-11-26 00:08:27,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-26 00:08:27,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-26 00:08:27,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 52: [2022-11-26 00:08:27,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-26 00:08:27,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-26 00:08:27,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 47: [2022-11-26 00:08:27,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-26 00:08:27,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 45: [2022-11-26 00:08:27,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 47: [2022-11-26 00:08:27,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 45: [2022-11-26 00:08:27,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 9: [2022-11-26 00:08:27,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 47: [2022-11-26 00:08:27,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 45: [2022-11-26 00:08:27,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 47: [2022-11-26 00:08:27,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 60: [2022-11-26 00:08:27,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 47: [2022-11-26 00:08:27,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 60: [2022-11-26 00:08:27,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-26 00:08:27,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 58: [2022-11-26 00:08:27,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-26 00:08:27,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-26 00:08:27,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 0: [2022-11-26 00:08:27,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 0: [2022-11-26 00:08:27,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-26 00:08:27,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 11: [2022-11-26 00:08:27,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 8: [2022-11-26 00:08:27,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 11: [2022-11-26 00:08:27,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 11: [2022-11-26 00:08:27,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 8: [2022-11-26 00:08:27,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 57: [2022-11-26 00:08:27,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 8: [2022-11-26 00:08:27,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 57: [2022-11-26 00:08:27,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 57: [2022-11-26 00:08:27,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 35: [2022-11-26 00:08:27,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 35: [2022-11-26 00:08:27,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-26 00:08:27,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 40: [2022-11-26 00:08:27,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 36: [2022-11-26 00:08:27,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 40: [2022-11-26 00:08:27,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 36: [2022-11-26 00:08:27,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-26 00:08:27,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 40: [2022-11-26 00:08:27,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 6: [2022-11-26 00:08:27,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 6: [2022-11-26 00:08:27,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 6: [2022-11-26 00:08:27,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 13: [2022-11-26 00:08:27,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 41: [2022-11-26 00:08:27,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 13: [2022-11-26 00:08:27,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 41: [2022-11-26 00:08:27,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 50: [2022-11-26 00:08:27,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 13: [2022-11-26 00:08:27,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 41: [2022-11-26 00:08:27,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 50: [2022-11-26 00:08:27,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 9: [2022-11-26 00:08:27,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 50: [2022-11-26 00:08:27,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 9: [2022-11-26 00:08:27,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 39: [2022-11-26 00:08:27,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 31: [2022-11-26 00:08:27,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 39: [2022-11-26 00:08:27,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 63: [2022-11-26 00:08:27,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 31: [2022-11-26 00:08:27,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 39: [2022-11-26 00:08:27,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 63: [2022-11-26 00:08:27,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 31: [2022-11-26 00:08:27,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 63: [2022-11-26 00:08:27,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 39: [2022-11-26 00:08:27,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-26 00:08:27,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-26 00:08:27,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 10: [2022-11-26 00:08:27,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 10: [2022-11-26 00:08:27,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 10: [2022-11-26 00:08:27,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 3: [2022-11-26 00:08:27,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-26 00:08:27,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 55: [2022-11-26 00:08:27,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 55: [2022-11-26 00:08:27,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-26 00:08:27,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 35: [2022-11-26 00:08:27,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 12: [2022-11-26 00:08:27,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 3: [2022-11-26 00:08:27,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 35: [2022-11-26 00:08:27,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 12: [2022-11-26 00:08:27,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-26 00:08:27,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 35: [2022-11-26 00:08:27,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 27: [2022-11-26 00:08:27,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-26 00:08:27,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-26 00:08:27,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 28: [2022-11-26 00:08:27,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-26 00:08:27,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 49: [2022-11-26 00:08:27,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 7: [2022-11-26 00:08:27,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 28: [2022-11-26 00:08:27,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 49: [2022-11-26 00:08:27,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 7: [2022-11-26 00:08:27,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-26 00:08:27,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 49: [2022-11-26 00:08:27,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 1: [2022-11-26 00:08:27,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-26 00:08:27,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 1: [2022-11-26 00:08:27,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 30: [2022-11-26 00:08:27,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:08:27,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:08:27,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-26 00:08:27,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 27: [2022-11-26 00:08:27,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:08:27,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 30: [2022-11-26 00:08:27,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 27: [2022-11-26 00:08:27,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 4: [2022-11-26 00:08:27,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 27: [2022-11-26 00:08:27,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 4: [2022-11-26 00:08:27,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-26 00:08:27,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 43: [2022-11-26 00:08:27,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-26 00:08:27,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-26 00:08:27,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 53: [2022-11-26 00:08:27,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-26 00:08:27,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-26 00:08:27,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 20: [2022-11-26 00:08:27,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:08:27,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 46: [2022-11-26 00:08:27,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 20: [2022-11-26 00:08:27,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 46: [2022-11-26 00:08:27,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-26 00:08:27,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 19: [2022-11-26 00:08:27,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 38: [2022-11-26 00:08:27,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-26 00:08:27,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 19: [2022-11-26 00:08:27,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 19: [2022-11-26 00:08:27,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 38: [2022-11-26 00:08:27,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 19: [2022-11-26 00:08:27,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-26 00:08:27,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-26 00:08:27,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 19: [2022-11-26 00:08:27,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-26 00:08:27,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 19: [2022-11-26 00:08:27,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 59: [2022-11-26 00:08:27,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-26 00:08:27,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-26 00:08:27,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 23: [2022-11-26 00:08:27,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-26 00:08:27,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 40: [2022-11-26 00:08:27,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 23: [2022-11-26 00:08:27,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 23: [2022-11-26 00:08:27,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 40: [2022-11-26 00:08:27,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 23: [2022-11-26 00:08:27,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 23: [2022-11-26 00:08:27,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 40: [2022-11-26 00:08:27,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 48: [2022-11-26 00:08:27,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-26 00:08:27,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-26 00:08:27,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 58: [2022-11-26 00:08:27,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-26 00:08:27,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-26 00:08:27,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 34: [2022-11-26 00:08:27,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-26 00:08:27,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-26 00:08:27,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 20: [2022-11-26 00:08:27,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:08:27,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-26 00:08:27,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 55: [2022-11-26 00:08:27,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-26 00:08:27,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-26 00:08:27,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 33: [2022-11-26 00:08:27,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-26 00:08:27,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-26 00:08:27,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 2: [2022-11-26 00:08:27,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 36: [2022-11-26 00:08:27,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 2: [2022-11-26 00:08:27,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-26 00:08:27,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 51: [2022-11-26 00:08:27,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-26 00:08:27,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-26 00:08:27,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 23: [2022-11-26 00:08:27,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 29: [2022-11-26 00:08:27,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-26 00:08:27,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 23: [2022-11-26 00:08:27,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 29: [2022-11-26 00:08:27,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 22: [2022-11-26 00:08:27,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:08:27,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 23: [2022-11-26 00:08:27,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 22: [2022-11-26 00:08:27,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 22: [2022-11-26 00:08:27,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 30: [2022-11-26 00:08:27,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 5: [2022-11-26 00:08:27,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:08:27,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 5: [2022-11-26 00:08:27,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-26 00:08:27,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 57: [2022-11-26 00:08:27,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 57: [2022-11-26 00:08:27,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-26 00:08:27,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 61: [2022-11-26 00:08:27,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 61: [2022-11-26 00:08:27,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 61: [2022-11-26 00:08:27,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 18: [2022-11-26 00:08:27,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-26 00:08:27,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-26 00:08:27,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 8: [2022-11-26 00:08:27,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-26 00:08:27,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-26 00:08:27,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 44: [2022-11-26 00:08:27,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-26 00:08:27,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-26 00:08:27,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 9: [2022-11-26 00:08:27,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 9: [2022-11-26 00:08:27,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-26 00:08:27,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 25: [2022-11-26 00:08:27,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-26 00:08:27,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-26 00:08:27,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 52: [2022-11-26 00:08:27,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-26 00:08:27,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-26 00:08:27,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 13: [2022-11-26 00:08:27,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 13: [2022-11-26 00:08:27,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-26 00:08:27,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 0: [2022-11-26 00:08:27,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 0: [2022-11-26 00:08:27,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 0: [2022-11-26 00:08:27,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 39: [2022-11-26 00:08:27,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-26 00:08:27,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 32: [2022-11-26 00:08:27,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 39: [2022-11-26 00:08:27,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 32: [2022-11-26 00:08:27,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-26 00:08:27,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 6: [2022-11-26 00:08:27,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-26 00:08:27,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-26 00:08:27,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 28: [2022-11-26 00:08:27,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 28: [2022-11-26 00:08:27,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-26 00:08:27,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 2: [2022-11-26 00:08:27,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 2: [2022-11-26 00:08:27,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-26 00:08:27,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 16: [2022-11-26 00:08:27,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-26 00:08:27,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-26 00:08:27,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 60: [2022-11-26 00:08:27,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-26 00:08:27,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-26 00:08:27,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 3: [2022-11-26 00:08:27,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 3: [2022-11-26 00:08:27,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-26 00:08:27,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 63: [2022-11-26 00:08:27,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-26 00:08:27,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-26 00:08:27,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 38: [2022-11-26 00:08:27,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-26 00:08:27,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 31: [2022-11-26 00:08:27,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 38: [2022-11-26 00:08:27,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 31: [2022-11-26 00:08:27,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 20: [2022-11-26 00:08:27,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 31: [2022-11-26 00:08:27,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 7: [2022-11-26 00:08:27,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:08:27,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 7: [2022-11-26 00:08:27,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 20: [2022-11-26 00:08:27,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 7: [2022-11-26 00:08:27,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 37: [2022-11-26 00:08:27,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-26 00:08:27,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 37: [2022-11-26 00:08:27,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-26 00:08:27,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-26 00:08:27,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-26 00:08:27,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-26 00:08:27,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 37: [2022-11-26 00:08:27,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 37: [2022-11-26 00:08:27,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 56: [2022-11-26 00:08:27,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 56: [2022-11-26 00:08:27,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-26 00:08:27,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-26 00:08:27,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 56: [2022-11-26 00:08:27,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-26 00:08:27,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-26 00:08:27,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 56: [2022-11-26 00:08:27,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 56: [2022-11-26 00:08:27,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 49: [2022-11-26 00:08:27,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-26 00:08:27,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-26 00:08:27,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 35: [2022-11-26 00:08:27,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-26 00:08:27,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-26 00:08:27,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 15: [2022-11-26 00:08:27,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-26 00:08:27,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-26 00:08:27,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 45: [2022-11-26 00:08:27,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-26 00:08:27,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-26 00:08:27,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 50: [2022-11-26 00:08:27,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-26 00:08:27,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 26: [2022-11-26 00:08:27,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 50: [2022-11-26 00:08:27,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 26: [2022-11-26 00:08:27,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-26 00:08:27,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 62: [2022-11-26 00:08:27,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-26 00:08:27,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-26 00:08:27,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 5: [2022-11-26 00:08:27,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 5: [2022-11-26 00:08:27,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-26 00:08:27,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 4: [2022-11-26 00:08:27,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-26 00:08:27,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-26 00:08:27,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 62: [2022-11-26 00:08:27,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-26 00:08:27,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-26 00:08:27,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 36: [2022-11-26 00:08:27,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-26 00:08:27,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 36: [2022-11-26 00:08:27,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-26 00:08:27,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-26 00:08:27,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 26: [2022-11-26 00:08:27,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-26 00:08:27,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-26 00:08:27,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 51: [2022-11-26 00:08:27,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-26 00:08:27,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-26 00:08:27,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 44: [2022-11-26 00:08:27,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 44: [2022-11-26 00:08:27,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 44: [2022-11-26 00:08:27,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 14: [2022-11-26 00:08:27,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-26 00:08:27,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-26 00:08:27,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 10: [2022-11-26 00:08:27,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 0: [2022-11-26 00:08:27,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 10: [2022-11-26 00:08:27,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-26 00:08:27,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 0: [2022-11-26 00:08:27,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 46: [2022-11-26 00:08:27,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-26 00:08:27,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 27: [2022-11-26 00:08:27,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 44: [2022-11-26 00:08:27,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 46: [2022-11-26 00:08:27,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 27: [2022-11-26 00:08:27,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 44: [2022-11-26 00:08:27,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 27: [2022-11-26 00:08:27,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 44: [2022-11-26 00:08:27,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 25: [2022-11-26 00:08:27,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 25: [2022-11-26 00:08:27,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-26 00:08:27,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 11: [2022-11-26 00:08:27,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-26 00:08:27,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-26 00:08:27,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 42: [2022-11-26 00:08:27,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 17: [2022-11-26 00:08:27,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 42: [2022-11-26 00:08:27,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 17: [2022-11-26 00:08:27,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 42: [2022-11-26 00:08:27,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 17: [2022-11-26 00:08:27,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 21: [2022-11-26 00:08:27,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:08:27,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-26 00:08:27,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 8: [2022-11-26 00:08:27,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 8: [2022-11-26 00:08:27,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-26 00:08:27,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 24: [2022-11-26 00:08:27,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-26 00:08:27,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-26 00:08:27,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 21: [2022-11-26 00:08:27,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:08:27,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-26 00:08:27,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 24: [2022-11-26 00:08:27,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 24: [2022-11-26 00:08:27,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-26 00:08:27,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 29: [2022-11-26 00:08:27,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-26 00:08:27,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-26 00:08:27,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 19: [2022-11-26 00:08:27,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-26 00:08:27,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-26 00:08:27,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 1: [2022-11-26 00:08:27,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-26 00:08:27,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-26 00:08:27,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 61: [2022-11-26 00:08:27,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-26 00:08:27,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-26 00:08:27,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 12: [2022-11-26 00:08:27,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-26 00:08:27,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-26 00:08:27,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 42: [2022-11-26 00:08:27,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 42: [2022-11-26 00:08:27,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-26 00:08:27,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 53: [2022-11-26 00:08:27,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-26 00:08:27,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-26 00:08:27,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 50: [2022-11-26 00:08:27,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-26 00:08:27,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-26 00:08:27,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 24: [2022-11-26 00:08:27,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-26 00:08:27,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-26 00:08:27,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 54: [2022-11-26 00:08:27,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 54: [2022-11-26 00:08:27,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-26 00:08:27,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 37: [2022-11-26 00:08:27,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-26 00:08:27,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-26 00:08:27,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 9: [2022-11-26 00:08:27,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-26 00:08:27,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-26 00:08:27,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 42: [2022-11-26 00:08:27,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 42: [2022-11-26 00:08:27,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-26 00:08:27,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 54: [2022-11-26 00:08:27,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 54: [2022-11-26 00:08:27,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-26 00:08:27,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 17: [2022-11-26 00:08:27,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-26 00:08:27,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-26 00:08:27,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 52: [2022-11-26 00:08:27,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-26 00:08:27,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-26 00:08:27,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 62: [2022-11-26 00:08:27,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-26 00:08:27,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-26 00:08:27,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 59: [2022-11-26 00:08:27,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 59: [2022-11-26 00:08:27,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-26 00:08:27,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 32: [2022-11-26 00:08:27,547] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-26 00:08:27,547] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-26 00:08:27,547] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 47: [2022-11-26 00:08:27,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-26 00:08:27,548] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-26 00:08:27,548] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 34: [2022-11-26 00:08:27,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-26 00:08:27,549] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-26 00:08:27,549] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 49: [2022-11-26 00:08:27,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-26 00:08:27,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-26 00:08:27,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 17: [2022-11-26 00:08:27,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-26 00:08:27,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-26 00:08:27,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 54: [2022-11-26 00:08:27,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 41: [2022-11-26 00:08:27,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 54: [2022-11-26 00:08:27,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 41: [2022-11-26 00:08:27,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-26 00:08:27,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 54: [2022-11-26 00:08:27,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 30: [2022-11-26 00:08:27,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:08:27,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-26 00:08:27,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 56: [2022-11-26 00:08:27,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 56: [2022-11-26 00:08:27,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 16: [2022-11-26 00:08:27,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 56: [2022-11-26 00:08:27,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 16: [2022-11-26 00:08:27,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-26 00:08:27,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 40: [2022-11-26 00:08:27,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-26 00:08:27,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-26 00:08:27,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 18: [2022-11-26 00:08:27,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 18: [2022-11-26 00:08:27,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-26 00:08:27,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 14: [2022-11-26 00:08:27,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 14: [2022-11-26 00:08:27,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-26 00:08:27,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 15: [2022-11-26 00:08:27,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-26 00:08:27,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-26 00:08:27,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 43: [2022-11-26 00:08:27,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-26 00:08:27,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-26 00:08:27,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 63: [2022-11-26 00:08:27,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-26 00:08:27,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-26 00:08:27,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 15: [2022-11-26 00:08:27,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-26 00:08:27,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-26 00:08:27,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 22: [2022-11-26 00:08:27,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-26 00:08:27,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-26 00:08:27,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 55: [2022-11-26 00:08:27,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 35: [2022-11-26 00:08:27,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 55: [2022-11-26 00:08:27,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-26 00:08:27,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 35: [2022-11-26 00:08:27,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 28: [2022-11-26 00:08:27,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 35: [2022-11-26 00:08:27,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 28: [2022-11-26 00:08:27,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-26 00:08:27,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 36: [2022-11-26 00:08:27,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-26 00:08:27,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-26 00:08:27,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 0: [2022-11-26 00:08:27,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-26 00:08:27,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-26 00:08:27,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 46: [2022-11-26 00:08:27,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-26 00:08:27,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-26 00:08:27,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 58: [2022-11-26 00:08:27,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-26 00:08:27,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-26 00:08:27,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 33: [2022-11-26 00:08:27,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-26 00:08:27,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-26 00:08:27,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 48: [2022-11-26 00:08:27,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-26 00:08:27,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-26 00:08:27,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 7: [2022-11-26 00:08:27,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-26 00:08:27,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-26 00:08:27,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 6: [2022-11-26 00:08:27,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-26 00:08:27,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-26 00:08:27,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 57: [2022-11-26 00:08:27,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-26 00:08:27,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-26 00:08:27,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 60: [2022-11-26 00:08:27,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 60: [2022-11-26 00:08:27,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 60: [2022-11-26 00:08:27,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 29: [2022-11-26 00:08:27,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-26 00:08:27,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-26 00:08:27,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 30: [2022-11-26 00:08:27,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:08:27,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-26 00:08:27,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 2: [2022-11-26 00:08:27,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 13: [2022-11-26 00:08:27,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 2: [2022-11-26 00:08:27,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 13: [2022-11-26 00:08:27,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 2: [2022-11-26 00:08:27,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 13: [2022-11-26 00:08:27,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 51: [2022-11-26 00:08:27,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-26 00:08:27,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-26 00:08:27,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 25: [2022-11-26 00:08:27,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 25: [2022-11-26 00:08:27,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-26 00:08:27,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 45: [2022-11-26 00:08:27,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 3: [2022-11-26 00:08:27,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 45: [2022-11-26 00:08:27,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-26 00:08:27,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 3: [2022-11-26 00:08:27,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-26 00:08:27,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 39: [2022-11-26 00:08:27,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-26 00:08:27,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-26 00:08:27,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 26: [2022-11-26 00:08:27,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-26 00:08:27,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-26 00:08:27,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 21: [2022-11-26 00:08:27,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:08:27,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-26 00:08:27,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 5: [2022-11-26 00:08:27,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-26 00:08:27,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 12: [2022-11-26 00:08:27,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-26 00:08:27,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 38: [2022-11-26 00:08:27,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 12: [2022-11-26 00:08:27,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 38: [2022-11-26 00:08:27,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-26 00:08:27,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 20: [2022-11-26 00:08:27,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:08:27,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-26 00:08:27,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 5: [2022-11-26 00:08:27,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 11: [2022-11-26 00:08:27,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 11: [2022-11-26 00:08:27,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 11: [2022-11-26 00:08:27,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 4: [2022-11-26 00:08:27,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-26 00:08:27,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-26 00:08:27,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 8: [2022-11-26 00:08:27,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-26 00:08:27,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-26 00:08:27,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 19: [2022-11-26 00:08:27,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 19: [2022-11-26 00:08:27,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-26 00:08:27,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 53: [2022-11-26 00:08:27,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-26 00:08:27,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-26 00:08:27,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 61: [2022-11-26 00:08:27,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-26 00:08:27,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-26 00:08:27,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 27: [2022-11-26 00:08:27,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-26 00:08:27,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-26 00:08:27,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 43: [2022-11-26 00:08:27,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-26 00:08:27,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-26 00:08:27,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 62: [2022-11-26 00:08:27,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-26 00:08:27,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-26 00:08:27,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 17: [2022-11-26 00:08:27,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-26 00:08:27,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-26 00:08:27,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 42: [2022-11-26 00:08:27,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 42: [2022-11-26 00:08:27,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 42: [2022-11-26 00:08:27,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 32: [2022-11-26 00:08:27,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-26 00:08:27,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-26 00:08:27,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 47: [2022-11-26 00:08:27,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-26 00:08:27,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-26 00:08:27,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 1: [2022-11-26 00:08:27,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 41: [2022-11-26 00:08:27,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 1: [2022-11-26 00:08:27,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 41: [2022-11-26 00:08:27,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 1: [2022-11-26 00:08:27,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 41: [2022-11-26 00:08:27,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 37: [2022-11-26 00:08:27,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-26 00:08:27,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-26 00:08:27,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 59: [2022-11-26 00:08:27,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-26 00:08:27,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-26 00:08:27,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 9: [2022-11-26 00:08:27,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 9: [2022-11-26 00:08:27,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-26 00:08:27,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 52: [2022-11-26 00:08:27,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-26 00:08:27,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-26 00:08:27,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 16: [2022-11-26 00:08:27,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-26 00:08:27,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 16: [2022-11-26 00:08:27,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 50: [2022-11-26 00:08:27,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-26 00:08:27,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-26 00:08:27,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 24: [2022-11-26 00:08:27,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-26 00:08:27,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-26 00:08:27,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 54: [2022-11-26 00:08:27,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 54: [2022-11-26 00:08:27,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 56: [2022-11-26 00:08:27,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-26 00:08:27,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-26 00:08:27,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 54: [2022-11-26 00:08:27,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 34: [2022-11-26 00:08:27,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 10: [2022-11-26 00:08:27,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-26 00:08:27,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 34: [2022-11-26 00:08:27,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 10: [2022-11-26 00:08:27,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 34: [2022-11-26 00:08:27,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 14: [2022-11-26 00:08:27,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-26 00:08:27,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-26 00:08:27,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 49: [2022-11-26 00:08:27,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 49: [2022-11-26 00:08:27,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-26 00:08:27,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 23: [2022-11-26 00:08:27,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-26 00:08:27,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-26 00:08:27,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 18: [2022-11-26 00:08:27,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-26 00:08:27,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-26 00:08:27,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 15: [2022-11-26 00:08:27,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-26 00:08:27,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-26 00:08:27,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 46: [2022-11-26 00:08:27,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-26 00:08:27,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-26 00:08:27,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 31: [2022-11-26 00:08:27,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 31: [2022-11-26 00:08:27,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-26 00:08:27,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 28: [2022-11-26 00:08:27,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-26 00:08:27,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-26 00:08:27,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 55: [2022-11-26 00:08:27,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-26 00:08:27,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-26 00:08:27,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 44: [2022-11-26 00:08:27,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-26 00:08:27,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-26 00:08:27,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 63: [2022-11-26 00:08:27,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-26 00:08:27,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-26 00:08:27,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 22: [2022-11-26 00:08:27,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-26 00:08:27,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-26 00:08:27,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 33: [2022-11-26 00:08:27,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 33: [2022-11-26 00:08:27,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-26 00:08:27,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 48: [2022-11-26 00:08:27,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 48: [2022-11-26 00:08:27,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-26 00:08:27,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 58: [2022-11-26 00:08:27,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-26 00:08:27,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-26 00:08:27,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 7: [2022-11-26 00:08:27,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 7: [2022-11-26 00:08:27,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 7: [2022-11-26 00:08:27,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 38: [2022-11-26 00:08:27,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-26 00:08:27,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-26 00:08:27,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 35: [2022-11-26 00:08:27,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-26 00:08:27,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-26 00:08:27,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 36: [2022-11-26 00:08:27,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-26 00:08:27,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-26 00:08:27,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 0: [2022-11-26 00:08:27,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-26 00:08:27,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-26 00:08:27,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 6: [2022-11-26 00:08:27,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 6: [2022-11-26 00:08:27,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 6: [2022-11-26 00:08:27,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 57: [2022-11-26 00:08:27,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 57: [2022-11-26 00:08:27,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 31: [2022-11-26 00:08:27,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-26 00:08:27,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 57: [2022-11-26 00:08:27,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 31: [2022-11-26 00:08:27,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 39: [2022-11-26 00:08:27,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-26 00:08:27,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-26 00:08:27,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 62: [2022-11-26 00:08:27,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-26 00:08:27,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-26 00:08:27,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 30: [2022-11-26 00:08:27,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:08:27,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-26 00:08:27,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 12: [2022-11-26 00:08:27,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-26 00:08:27,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-26 00:08:27,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 40: [2022-11-26 00:08:27,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-26 00:08:27,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-26 00:08:27,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 51: [2022-11-26 00:08:27,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-26 00:08:27,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-26 00:08:27,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 3: [2022-11-26 00:08:27,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-26 00:08:27,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-26 00:08:27,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 29: [2022-11-26 00:08:27,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-26 00:08:27,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-26 00:08:27,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 25: [2022-11-26 00:08:27,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-26 00:08:27,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-26 00:08:27,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 45: [2022-11-26 00:08:27,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 45: [2022-11-26 00:08:27,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 45: [2022-11-26 00:08:27,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 5: [2022-11-26 00:08:27,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 26: [2022-11-26 00:08:27,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 60: [2022-11-26 00:08:27,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 5: [2022-11-26 00:08:27,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 26: [2022-11-26 00:08:27,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 5: [2022-11-26 00:08:27,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 60: [2022-11-26 00:08:27,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 26: [2022-11-26 00:08:27,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 60: [2022-11-26 00:08:27,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 20: [2022-11-26 00:08:27,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:08:27,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-26 00:08:27,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 53: [2022-11-26 00:08:27,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-26 00:08:27,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-26 00:08:27,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 4: [2022-11-26 00:08:27,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 19: [2022-11-26 00:08:27,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-26 00:08:27,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-26 00:08:27,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 21: [2022-11-26 00:08:27,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:08:27,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-26 00:08:27,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 4: [2022-11-26 00:08:27,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-26 00:08:27,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 8: [2022-11-26 00:08:27,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 8: [2022-11-26 00:08:27,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-26 00:08:27,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 2: [2022-11-26 00:08:27,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 11: [2022-11-26 00:08:27,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 2: [2022-11-26 00:08:27,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 11: [2022-11-26 00:08:27,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 2: [2022-11-26 00:08:27,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 11: [2022-11-26 00:08:27,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 10: [2022-11-26 00:08:27,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-26 00:08:27,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-26 00:08:27,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 43: [2022-11-26 00:08:27,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 43: [2022-11-26 00:08:27,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 43: [2022-11-26 00:08:27,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 13: [2022-11-26 00:08:27,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 61: [2022-11-26 00:08:27,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-26 00:08:27,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 13: [2022-11-26 00:08:27,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 61: [2022-11-26 00:08:27,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 13: [2022-11-26 00:08:27,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 1: [2022-11-26 00:08:27,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-26 00:08:27,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-26 00:08:27,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 23: [2022-11-26 00:08:27,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-26 00:08:27,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-26 00:08:27,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 42: [2022-11-26 00:08:27,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 42: [2022-11-26 00:08:27,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-26 00:08:27,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 31: [2022-11-26 00:08:27,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-26 00:08:27,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-26 00:08:27,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 27: [2022-11-26 00:08:27,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 27: [2022-11-26 00:08:27,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-26 00:08:27,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 9: [2022-11-26 00:08:27,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 9: [2022-11-26 00:08:27,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-26 00:08:27,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 52: [2022-11-26 00:08:27,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-26 00:08:27,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-26 00:08:27,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 59: [2022-11-26 00:08:27,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-26 00:08:27,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-26 00:08:27,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 32: [2022-11-26 00:08:27,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-26 00:08:27,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-26 00:08:27,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 34: [2022-11-26 00:08:27,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 34: [2022-11-26 00:08:27,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-26 00:08:27,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 54: [2022-11-26 00:08:27,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 54: [2022-11-26 00:08:27,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-26 00:08:27,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 50: [2022-11-26 00:08:27,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 16: [2022-11-26 00:08:27,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 50: [2022-11-26 00:08:27,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-26 00:08:27,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 16: [2022-11-26 00:08:27,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-26 00:08:27,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 17: [2022-11-26 00:08:27,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-26 00:08:27,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-26 00:08:27,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 24: [2022-11-26 00:08:27,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-26 00:08:27,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-26 00:08:27,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 41: [2022-11-26 00:08:27,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-26 00:08:27,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-26 00:08:27,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 49: [2022-11-26 00:08:27,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-26 00:08:27,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-26 00:08:27,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 47: [2022-11-26 00:08:27,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-26 00:08:27,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-26 00:08:27,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 18: [2022-11-26 00:08:27,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-26 00:08:27,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-26 00:08:27,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 28: [2022-11-26 00:08:27,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-26 00:08:27,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-26 00:08:27,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 14: [2022-11-26 00:08:27,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 14: [2022-11-26 00:08:27,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-26 00:08:27,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 33: [2022-11-26 00:08:27,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-26 00:08:27,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-26 00:08:27,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 63: [2022-11-26 00:08:27,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-26 00:08:27,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-26 00:08:27,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 48: [2022-11-26 00:08:27,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-26 00:08:27,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-26 00:08:27,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 58: [2022-11-26 00:08:27,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-26 00:08:27,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-26 00:08:27,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 55: [2022-11-26 00:08:27,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-26 00:08:27,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-26 00:08:27,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 7: [2022-11-26 00:08:27,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 7: [2022-11-26 00:08:27,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-26 00:08:27,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 51: [2022-11-26 00:08:27,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-26 00:08:27,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-26 00:08:27,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 22: [2022-11-26 00:08:27,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-26 00:08:27,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-26 00:08:27,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 36: [2022-11-26 00:08:27,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-26 00:08:27,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 62: [2022-11-26 00:08:27,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 36: [2022-11-26 00:08:27,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 62: [2022-11-26 00:08:27,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 62: [2022-11-26 00:08:27,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 0: [2022-11-26 00:08:27,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-26 00:08:27,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 0: [2022-11-26 00:08:27,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 40: [2022-11-26 00:08:27,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-26 00:08:27,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-26 00:08:27,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 29: [2022-11-26 00:08:27,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-26 00:08:27,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 29: [2022-11-26 00:08:27,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 38: [2022-11-26 00:08:27,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 38: [2022-11-26 00:08:27,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-26 00:08:27,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 30: [2022-11-26 00:08:27,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:08:27,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-26 00:08:27,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 39: [2022-11-26 00:08:27,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 39: [2022-11-26 00:08:27,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-26 00:08:27,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 23: [2022-11-26 00:08:27,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-26 00:08:27,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-26 00:08:27,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 11: [2022-11-26 00:08:27,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-26 00:08:27,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-26 00:08:27,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 6: [2022-11-26 00:08:27,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-26 00:08:27,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-26 00:08:27,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 35: [2022-11-26 00:08:27,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-26 00:08:27,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 57: [2022-11-26 00:08:27,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 35: [2022-11-26 00:08:27,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 57: [2022-11-26 00:08:27,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-26 00:08:27,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 37: [2022-11-26 00:08:27,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 37: [2022-11-26 00:08:27,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-26 00:08:27,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 26: [2022-11-26 00:08:27,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 5: [2022-11-26 00:08:27,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 26: [2022-11-26 00:08:27,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 26: [2022-11-26 00:08:27,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 5: [2022-11-26 00:08:27,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-26 00:08:27,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 27: [2022-11-26 00:08:27,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-26 00:08:27,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-26 00:08:27,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 2: [2022-11-26 00:08:27,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-26 00:08:27,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-26 00:08:27,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 20: [2022-11-26 00:08:27,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:08:27,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-26 00:08:27,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 12: [2022-11-26 00:08:27,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-26 00:08:27,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-26 00:08:27,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 60: [2022-11-26 00:08:27,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-26 00:08:27,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-26 00:08:27,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 21: [2022-11-26 00:08:27,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 4: [2022-11-26 00:08:27,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:08:27,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-26 00:08:27,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 45: [2022-11-26 00:08:27,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-26 00:08:27,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-26 00:08:27,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 4: [2022-11-26 00:08:27,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-26 00:08:27,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 43: [2022-11-26 00:08:27,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-26 00:08:27,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 25: [2022-11-26 00:08:27,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 43: [2022-11-26 00:08:27,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 25: [2022-11-26 00:08:27,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-26 00:08:27,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 19: [2022-11-26 00:08:27,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-26 00:08:27,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 3: [2022-11-26 00:08:27,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 19: [2022-11-26 00:08:27,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 3: [2022-11-26 00:08:27,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 10: [2022-11-26 00:08:27,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 3: [2022-11-26 00:08:27,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 10: [2022-11-26 00:08:27,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-26 00:08:27,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 61: [2022-11-26 00:08:27,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-26 00:08:27,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-26 00:08:27,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 17: [2022-11-26 00:08:27,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 1: [2022-11-26 00:08:27,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 17: [2022-11-26 00:08:27,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 1: [2022-11-26 00:08:27,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 17: [2022-11-26 00:08:27,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 1: [2022-11-26 00:08:27,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 24: [2022-11-26 00:08:27,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 13: [2022-11-26 00:08:27,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-26 00:08:27,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 24: [2022-11-26 00:08:27,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 13: [2022-11-26 00:08:27,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 24: [2022-11-26 00:08:27,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 9: [2022-11-26 00:08:27,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 42: [2022-11-26 00:08:27,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 42: [2022-11-26 00:08:27,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 9: [2022-11-26 00:08:27,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 42: [2022-11-26 00:08:27,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 9: [2022-11-26 00:08:27,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 52: [2022-11-26 00:08:27,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 41: [2022-11-26 00:08:27,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 52: [2022-11-26 00:08:27,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-26 00:08:27,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 41: [2022-11-26 00:08:27,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-26 00:08:27,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 31: [2022-11-26 00:08:27,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 31: [2022-11-26 00:08:27,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-26 00:08:27,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 34: [2022-11-26 00:08:27,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 34: [2022-11-26 00:08:27,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-26 00:08:27,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 32: [2022-11-26 00:08:27,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-26 00:08:27,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-26 00:08:27,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 56: [2022-11-26 00:08:27,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-26 00:08:27,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-26 00:08:27,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 8: [2022-11-26 00:08:27,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-26 00:08:27,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-26 00:08:27,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 47: [2022-11-26 00:08:27,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-26 00:08:27,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 47: [2022-11-26 00:08:27,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 50: [2022-11-26 00:08:27,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-26 00:08:27,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-26 00:08:27,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 16: [2022-11-26 00:08:27,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-26 00:08:27,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-26 00:08:27,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 49: [2022-11-26 00:08:27,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-26 00:08:27,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-26 00:08:27,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 54: [2022-11-26 00:08:27,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 54: [2022-11-26 00:08:27,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-26 00:08:27,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 18: [2022-11-26 00:08:27,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-26 00:08:27,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-26 00:08:27,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 63: [2022-11-26 00:08:27,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-26 00:08:27,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-26 00:08:27,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 28: [2022-11-26 00:08:27,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 28: [2022-11-26 00:08:27,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-26 00:08:27,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 22: [2022-11-26 00:08:27,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-26 00:08:27,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-26 00:08:27,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 14: [2022-11-26 00:08:27,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-26 00:08:27,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-26 00:08:27,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 33: [2022-11-26 00:08:27,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 33: [2022-11-26 00:08:27,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 33: [2022-11-26 00:08:27,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 40: [2022-11-26 00:08:27,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-26 00:08:27,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-26 00:08:27,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 7: [2022-11-26 00:08:27,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-26 00:08:27,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-26 00:08:27,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 30: [2022-11-26 00:08:27,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:08:27,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 30: [2022-11-26 00:08:27,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 62: [2022-11-26 00:08:27,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-26 00:08:27,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-26 00:08:27,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 48: [2022-11-26 00:08:27,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-26 00:08:27,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-26 00:08:27,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 6: [2022-11-26 00:08:27,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-26 00:08:27,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-26 00:08:27,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 58: [2022-11-26 00:08:27,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 58: [2022-11-26 00:08:27,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-26 00:08:27,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 44: [2022-11-26 00:08:27,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-26 00:08:27,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-26 00:08:27,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 46: [2022-11-26 00:08:27,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-26 00:08:27,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-26 00:08:27,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 55: [2022-11-26 00:08:27,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 55: [2022-11-26 00:08:27,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-26 00:08:27,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 59: [2022-11-26 00:08:27,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-26 00:08:27,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-26 00:08:27,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 13: [2022-11-26 00:08:27,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-26 00:08:27,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-26 00:08:27,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 0: [2022-11-26 00:08:27,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-26 00:08:27,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-26 00:08:27,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 38: [2022-11-26 00:08:27,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 38: [2022-11-26 00:08:27,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-26 00:08:27,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 35: [2022-11-26 00:08:27,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-26 00:08:27,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-26 00:08:27,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 36: [2022-11-26 00:08:27,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 36: [2022-11-26 00:08:27,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-26 00:08:27,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 57: [2022-11-26 00:08:27,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 37: [2022-11-26 00:08:27,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 57: [2022-11-26 00:08:27,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 37: [2022-11-26 00:08:27,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 57: [2022-11-26 00:08:27,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 37: [2022-11-26 00:08:27,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 51: [2022-11-26 00:08:27,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-26 00:08:27,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-26 00:08:27,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 2: [2022-11-26 00:08:27,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-26 00:08:27,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-26 00:08:27,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 54: [2022-11-26 00:08:27,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 54: [2022-11-26 00:08:27,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-26 00:08:27,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 23: [2022-11-26 00:08:27,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-26 00:08:27,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-26 00:08:27,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 25: [2022-11-26 00:08:27,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-26 00:08:27,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-26 00:08:27,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 1: [2022-11-26 00:08:27,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 45: [2022-11-26 00:08:27,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 45: [2022-11-26 00:08:27,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 1: [2022-11-26 00:08:27,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 45: [2022-11-26 00:08:27,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 29: [2022-11-26 00:08:27,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 1: [2022-11-26 00:08:27,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 29: [2022-11-26 00:08:27,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-26 00:08:27,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 46: [2022-11-26 00:08:27,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-26 00:08:27,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-26 00:08:27,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 11: [2022-11-26 00:08:27,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-26 00:08:27,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 11: [2022-11-26 00:08:27,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 3: [2022-11-26 00:08:27,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:08:27,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:08:27,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 3: [2022-11-26 00:08:27,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 21: [2022-11-26 00:08:27,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 5: [2022-11-26 00:08:27,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 3: [2022-11-26 00:08:27,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 5: [2022-11-26 00:08:27,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 24: [2022-11-26 00:08:27,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 5: [2022-11-26 00:08:27,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 24: [2022-11-26 00:08:27,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-26 00:08:27,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 50: [2022-11-26 00:08:27,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 50: [2022-11-26 00:08:27,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-26 00:08:27,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 4: [2022-11-26 00:08:27,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 4: [2022-11-26 00:08:27,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-26 00:08:27,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 22: [2022-11-26 00:08:27,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 10: [2022-11-26 00:08:27,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 22: [2022-11-26 00:08:27,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-26 00:08:27,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 10: [2022-11-26 00:08:27,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-26 00:08:27,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 41: [2022-11-26 00:08:27,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-26 00:08:27,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 42: [2022-11-26 00:08:27,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 41: [2022-11-26 00:08:27,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 42: [2022-11-26 00:08:27,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-26 00:08:27,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 52: [2022-11-26 00:08:27,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-26 00:08:27,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-26 00:08:27,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 19: [2022-11-26 00:08:27,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-26 00:08:27,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-26 00:08:27,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 59: [2022-11-26 00:08:27,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-26 00:08:27,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-26 00:08:27,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 46: [2022-11-26 00:08:27,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-26 00:08:27,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-26 00:08:27,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 39: [2022-11-26 00:08:27,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-26 00:08:27,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 37: [2022-11-26 00:08:27,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 39: [2022-11-26 00:08:27,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-26 00:08:27,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 37: [2022-11-26 00:08:27,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 39: [2022-11-26 00:08:27,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 39: [2022-11-26 00:08:27,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 37: [2022-11-26 00:08:27,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 61: [2022-11-26 00:08:27,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 12: [2022-11-26 00:08:27,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 34: [2022-11-26 00:08:27,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 61: [2022-11-26 00:08:27,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-26 00:08:27,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 12: [2022-11-26 00:08:27,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-26 00:08:27,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 34: [2022-11-26 00:08:27,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-26 00:08:27,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 27: [2022-11-26 00:08:27,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 63: [2022-11-26 00:08:27,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 27: [2022-11-26 00:08:27,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 63: [2022-11-26 00:08:27,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 27: [2022-11-26 00:08:27,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 48: [2022-11-26 00:08:27,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 63: [2022-11-26 00:08:27,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 48: [2022-11-26 00:08:27,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-26 00:08:27,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 17: [2022-11-26 00:08:27,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 26: [2022-11-26 00:08:27,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 17: [2022-11-26 00:08:27,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-26 00:08:27,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 26: [2022-11-26 00:08:27,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-26 00:08:27,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 23: [2022-11-26 00:08:27,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 23: [2022-11-26 00:08:27,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-26 00:08:27,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 60: [2022-11-26 00:08:27,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 18: [2022-11-26 00:08:27,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 60: [2022-11-26 00:08:27,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 18: [2022-11-26 00:08:27,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 60: [2022-11-26 00:08:27,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 18: [2022-11-26 00:08:27,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 38: [2022-11-26 00:08:27,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-26 00:08:27,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-26 00:08:27,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 16: [2022-11-26 00:08:27,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 16: [2022-11-26 00:08:27,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-26 00:08:27,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 43: [2022-11-26 00:08:27,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-26 00:08:27,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-26 00:08:27,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 9: [2022-11-26 00:08:27,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 28: [2022-11-26 00:08:27,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 31: [2022-11-26 00:08:27,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 3: [2022-11-26 00:08:27,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 28: [2022-11-26 00:08:27,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 31: [2022-11-26 00:08:27,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 28: [2022-11-26 00:08:27,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 31: [2022-11-26 00:08:27,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 3: [2022-11-26 00:08:27,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 32: [2022-11-26 00:08:27,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 3: [2022-11-26 00:08:27,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 32: [2022-11-26 00:08:27,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-26 00:08:27,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 58: [2022-11-26 00:08:27,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-26 00:08:27,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-26 00:08:27,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 36: [2022-11-26 00:08:27,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 36: [2022-11-26 00:08:27,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-26 00:08:27,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 35: [2022-11-26 00:08:27,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 9: [2022-11-26 00:08:27,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 35: [2022-11-26 00:08:27,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 9: [2022-11-26 00:08:27,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 35: [2022-11-26 00:08:27,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 47: [2022-11-26 00:08:27,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 47: [2022-11-26 00:08:27,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 47: [2022-11-26 00:08:27,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 20: [2022-11-26 00:08:27,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:08:27,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:08:27,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-26 00:08:27,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-26 00:08:27,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 20: [2022-11-26 00:08:27,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 51: [2022-11-26 00:08:27,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 51: [2022-11-26 00:08:27,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-26 00:08:27,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 49: [2022-11-26 00:08:27,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-26 00:08:27,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-26 00:08:27,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 14: [2022-11-26 00:08:27,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 44: [2022-11-26 00:08:27,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 14: [2022-11-26 00:08:27,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 44: [2022-11-26 00:08:27,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 14: [2022-11-26 00:08:27,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 44: [2022-11-26 00:08:27,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 33: [2022-11-26 00:08:27,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-26 00:08:27,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-26 00:08:27,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 26: [2022-11-26 00:08:27,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-26 00:08:27,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-26 00:08:27,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 33: [2022-11-26 00:08:27,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-26 00:08:27,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-26 00:08:27,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 55: [2022-11-26 00:08:27,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 55: [2022-11-26 00:08:27,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 55: [2022-11-26 00:08:27,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 44: [2022-11-26 00:08:27,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-26 00:08:27,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-26 00:08:27,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 53: [2022-11-26 00:08:27,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-26 00:08:27,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-26 00:08:27,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 55: [2022-11-26 00:08:27,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-26 00:08:27,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-26 00:08:27,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 40: [2022-11-26 00:08:27,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-26 00:08:27,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 40: [2022-11-26 00:08:27,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-26 00:08:27,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 40: [2022-11-26 00:08:27,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 40: [2022-11-26 00:08:27,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 60: [2022-11-26 00:08:27,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-26 00:08:27,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-26 00:08:27,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 14: [2022-11-26 00:08:27,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-26 00:08:27,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-26 00:08:27,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 56: [2022-11-26 00:08:27,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 56: [2022-11-26 00:08:27,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 56: [2022-11-26 00:08:27,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 15: [2022-11-26 00:08:27,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-26 00:08:27,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-26 00:08:27,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 53: [2022-11-26 00:08:27,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-26 00:08:27,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-26 00:08:27,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 25: [2022-11-26 00:08:27,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 25: [2022-11-26 00:08:27,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 25: [2022-11-26 00:08:27,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 44: [2022-11-26 00:08:27,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-26 00:08:27,793] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-26 00:08:27,793] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 8: [2022-11-26 00:08:27,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-26 00:08:27,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 8: [2022-11-26 00:08:27,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 56: [2022-11-26 00:08:27,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-26 00:08:27,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-26 00:08:27,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 15: [2022-11-26 00:08:27,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-26 00:08:27,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 15: [2022-11-26 00:08:27,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 53: [2022-11-26 00:08:27,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-26 00:08:27,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-26 00:08:27,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 15: [2022-11-26 00:08:27,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 15: [2022-11-26 00:08:27,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step10000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-26 00:08:27,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! 0: successfully saved checkpoint at iteration 10000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5276.82 63: iteration 10010/ 24424 | consumed samples: 5125120 | consumed tokens: 10496245760 | elapsed time per iteration (s): 2.83 | learning rate: 1.368E-04 | global batch size: 512 | lm loss: 2.131463E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.886 | TFLOPs: 18.62 | 63: iteration 10020/ 24424 | consumed samples: 5130240 | consumed tokens: 10506731520 | elapsed time per iteration (s): 2.24 | learning rate: 1.367E-04 | global batch size: 512 | lm loss: 2.146379E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.525 | TFLOPs: 23.53 | 63: iteration 10030/ 24424 | consumed samples: 5135360 | consumed tokens: 10517217280 | elapsed time per iteration (s): 2.23 | learning rate: 1.365E-04 | global batch size: 512 | lm loss: 2.135341E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.554 | TFLOPs: 23.63 | 63: iteration 10040/ 24424 | consumed samples: 5140480 | consumed tokens: 10527703040 | elapsed time per iteration (s): 2.23 | learning rate: 1.364E-04 | global batch size: 512 | lm loss: 2.118500E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.247 | TFLOPs: 23.60 | 63: iteration 10050/ 24424 | consumed samples: 5145600 | consumed tokens: 10538188800 | elapsed time per iteration (s): 2.23 | learning rate: 1.363E-04 | global batch size: 512 | lm loss: 2.127018E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.095 | TFLOPs: 23.58 | 63: iteration 10060/ 24424 | consumed samples: 5150720 | consumed tokens: 10548674560 | elapsed time per iteration (s): 2.26 | learning rate: 1.362E-04 | global batch size: 512 | lm loss: 2.118567E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.910 | TFLOPs: 23.36 | 63: iteration 10070/ 24424 | consumed samples: 5155840 | consumed tokens: 10559160320 | elapsed time per iteration (s): 2.24 | learning rate: 1.361E-04 | global batch size: 512 | lm loss: 2.131627E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.427 | TFLOPs: 23.52 | 63: iteration 10080/ 24424 | consumed samples: 5160960 | consumed tokens: 10569646080 | elapsed time per iteration (s): 2.26 | learning rate: 1.360E-04 | global batch size: 512 | lm loss: 2.159729E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.438 | TFLOPs: 23.31 | 63: iteration 10090/ 24424 | consumed samples: 5166080 | consumed tokens: 10580131840 | elapsed time per iteration (s): 2.24 | learning rate: 1.359E-04 | global batch size: 512 | lm loss: 2.135749E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.611 | TFLOPs: 23.53 | 63: iteration 10100/ 24424 | consumed samples: 5171200 | consumed tokens: 10590617600 | elapsed time per iteration (s): 2.23 | learning rate: 1.358E-04 | global batch size: 512 | lm loss: 2.140954E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.376 | TFLOPs: 23.61 | 63: iteration 10110/ 24424 | consumed samples: 5176320 | consumed tokens: 10601103360 | elapsed time per iteration (s): 2.23 | learning rate: 1.357E-04 | global batch size: 512 | lm loss: 2.105318E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.926 | TFLOPs: 23.67 | 63: iteration 10120/ 24424 | consumed samples: 5181440 | consumed tokens: 10611589120 | elapsed time per iteration (s): 2.23 | learning rate: 1.355E-04 | global batch size: 512 | lm loss: 2.132844E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.678 | TFLOPs: 23.64 | 63: iteration 10130/ 24424 | consumed samples: 5186560 | consumed tokens: 10622074880 | elapsed time per iteration (s): 2.25 | learning rate: 1.354E-04 | global batch size: 512 | lm loss: 2.124015E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.975 | TFLOPs: 23.47 | 63: iteration 10140/ 24424 | consumed samples: 5191680 | consumed tokens: 10632560640 | elapsed time per iteration (s): 2.28 | learning rate: 1.353E-04 | global batch size: 512 | lm loss: 2.134915E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.361 | TFLOPs: 23.10 | 63: iteration 10150/ 24424 | consumed samples: 5196800 | consumed tokens: 10643046400 | elapsed time per iteration (s): 2.25 | learning rate: 1.352E-04 | global batch size: 512 | lm loss: 2.143896E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.634 | TFLOPs: 23.43 | 63: iteration 10160/ 24424 | consumed samples: 5201920 | consumed tokens: 10653532160 | elapsed time per iteration (s): 2.24 | learning rate: 1.351E-04 | global batch size: 512 | lm loss: 2.127850E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.433 | TFLOPs: 23.52 | 63: iteration 10170/ 24424 | consumed samples: 5207040 | consumed tokens: 10664017920 | elapsed time per iteration (s): 2.26 | learning rate: 1.350E-04 | global batch size: 512 | lm loss: 2.138520E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.584 | TFLOPs: 23.33 | 63: iteration 10180/ 24424 | consumed samples: 5212160 | consumed tokens: 10674503680 | elapsed time per iteration (s): 2.23 | learning rate: 1.349E-04 | global batch size: 512 | lm loss: 2.118677E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.297 | TFLOPs: 23.61 | 63: iteration 10190/ 24424 | consumed samples: 5217280 | consumed tokens: 10684989440 | elapsed time per iteration (s): 2.24 | learning rate: 1.348E-04 | global batch size: 512 | lm loss: 2.125653E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.201 | TFLOPs: 23.49 | 63: iteration 10200/ 24424 | consumed samples: 5222400 | consumed tokens: 10695475200 | elapsed time per iteration (s): 2.23 | learning rate: 1.346E-04 | global batch size: 512 | lm loss: 2.134574E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.808 | TFLOPs: 23.66 | 63: iteration 10210/ 24424 | consumed samples: 5227520 | consumed tokens: 10705960960 | elapsed time per iteration (s): 2.25 | learning rate: 1.345E-04 | global batch size: 512 | lm loss: 2.131487E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.354 | TFLOPs: 23.41 | 63: iteration 10220/ 24424 | consumed samples: 5232640 | consumed tokens: 10716446720 | elapsed time per iteration (s): 2.24 | learning rate: 1.344E-04 | global batch size: 512 | lm loss: 2.139023E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.300 | TFLOPs: 23.50 | 63: iteration 10230/ 24424 | consumed samples: 5237760 | consumed tokens: 10726932480 | elapsed time per iteration (s): 2.26 | learning rate: 1.343E-04 | global batch size: 512 | lm loss: 2.126874E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.242 | TFLOPs: 23.29 | 63: iteration 10240/ 24424 | consumed samples: 5242880 | consumed tokens: 10737418240 | elapsed time per iteration (s): 2.24 | learning rate: 1.342E-04 | global batch size: 512 | lm loss: 2.132442E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.126 | TFLOPs: 23.48 | 63: iteration 10250/ 24424 | consumed samples: 5248000 | consumed tokens: 10747904000 | elapsed time per iteration (s): 2.24 | learning rate: 1.341E-04 | global batch size: 512 | lm loss: 2.135910E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.371 | TFLOPs: 23.51 | 63: iteration 10260/ 24424 | consumed samples: 5253120 | consumed tokens: 10758389760 | elapsed time per iteration (s): 2.24 | learning rate: 1.340E-04 | global batch size: 512 | lm loss: 2.154789E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.379 | TFLOPs: 23.51 | 63: iteration 10270/ 24424 | consumed samples: 5258240 | consumed tokens: 10768875520 | elapsed time per iteration (s): 2.23 | learning rate: 1.339E-04 | global batch size: 512 | lm loss: 2.129014E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.395 | TFLOPs: 23.62 | 63: iteration 10280/ 24424 | consumed samples: 5263360 | consumed tokens: 10779361280 | elapsed time per iteration (s): 2.50 | learning rate: 1.337E-04 | global batch size: 512 | lm loss: 2.139663E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 204.500 | TFLOPs: 21.05 | 63: iteration 10290/ 24424 | consumed samples: 5268480 | consumed tokens: 10789847040 | elapsed time per iteration (s): 2.25 | learning rate: 1.336E-04 | global batch size: 512 | lm loss: 2.114503E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.056 | TFLOPs: 23.37 | 63: iteration 10300/ 24424 | consumed samples: 5273600 | consumed tokens: 10800332800 | elapsed time per iteration (s): 2.24 | learning rate: 1.335E-04 | global batch size: 512 | lm loss: 2.124001E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.921 | TFLOPs: 23.57 | 63: iteration 10310/ 24424 | consumed samples: 5278720 | consumed tokens: 10810818560 | elapsed time per iteration (s): 2.23 | learning rate: 1.334E-04 | global batch size: 512 | lm loss: 2.113978E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.838 | TFLOPs: 23.66 | 63: iteration 10320/ 24424 | consumed samples: 5283840 | consumed tokens: 10821304320 | elapsed time per iteration (s): 2.28 | learning rate: 1.333E-04 | global batch size: 512 | lm loss: 2.147663E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.589 | TFLOPs: 23.12 | 63: iteration 10330/ 24424 | consumed samples: 5288960 | consumed tokens: 10831790080 | elapsed time per iteration (s): 2.30 | learning rate: 1.332E-04 | global batch size: 512 | lm loss: 2.143400E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.181 | TFLOPs: 22.87 | 63: iteration 10340/ 24424 | consumed samples: 5294080 | consumed tokens: 10842275840 | elapsed time per iteration (s): 2.25 | learning rate: 1.331E-04 | global batch size: 512 | lm loss: 2.124891E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.746 | TFLOPs: 23.45 | 63: iteration 10350/ 24424 | consumed samples: 5299200 | consumed tokens: 10852761600 | elapsed time per iteration (s): 2.24 | learning rate: 1.329E-04 | global batch size: 512 | lm loss: 2.132751E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.721 | TFLOPs: 23.55 | 63: iteration 10360/ 24424 | consumed samples: 5304320 | consumed tokens: 10863247360 | elapsed time per iteration (s): 2.25 | learning rate: 1.328E-04 | global batch size: 512 | lm loss: 2.115382E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.300 | TFLOPs: 23.40 | 63: iteration 10370/ 24424 | consumed samples: 5309440 | consumed tokens: 10873733120 | elapsed time per iteration (s): 2.24 | learning rate: 1.327E-04 | global batch size: 512 | lm loss: 2.124213E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.959 | TFLOPs: 23.57 | 63: iteration 10380/ 24424 | consumed samples: 5314560 | consumed tokens: 10884218880 | elapsed time per iteration (s): 2.23 | learning rate: 1.326E-04 | global batch size: 512 | lm loss: 2.133962E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.407 | TFLOPs: 23.62 | 63: iteration 10390/ 24424 | consumed samples: 5319680 | consumed tokens: 10894704640 | elapsed time per iteration (s): 2.24 | learning rate: 1.325E-04 | global batch size: 512 | lm loss: 2.128731E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.456 | TFLOPs: 23.52 | 63: iteration 10400/ 24424 | consumed samples: 5324800 | consumed tokens: 10905190400 | elapsed time per iteration (s): 2.23 | learning rate: 1.324E-04 | global batch size: 512 | lm loss: 2.142790E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.742 | TFLOPs: 23.65 | 63: iteration 10410/ 24424 | consumed samples: 5329920 | consumed tokens: 10915676160 | elapsed time per iteration (s): 2.25 | learning rate: 1.323E-04 | global batch size: 512 | lm loss: 2.129838E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.767 | TFLOPs: 23.45 | 63: iteration 10420/ 24424 | consumed samples: 5335040 | consumed tokens: 10926161920 | elapsed time per iteration (s): 2.25 | learning rate: 1.322E-04 | global batch size: 512 | lm loss: 2.109153E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.858 | TFLOPs: 23.46 | 63: iteration 10430/ 24424 | consumed samples: 5340160 | consumed tokens: 10936647680 | elapsed time per iteration (s): 2.24 | learning rate: 1.320E-04 | global batch size: 512 | lm loss: 2.132504E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.547 | TFLOPs: 23.53 | 63: iteration 10440/ 24424 | consumed samples: 5345280 | consumed tokens: 10947133440 | elapsed time per iteration (s): 2.40 | learning rate: 1.319E-04 | global batch size: 512 | lm loss: 2.139696E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 213.211 | TFLOPs: 21.95 | 63: iteration 10450/ 24424 | consumed samples: 5350400 | consumed tokens: 10957619200 | elapsed time per iteration (s): 2.25 | learning rate: 1.318E-04 | global batch size: 512 | lm loss: 2.117859E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.592 | TFLOPs: 23.43 | 63: iteration 10460/ 24424 | consumed samples: 5355520 | consumed tokens: 10968104960 | elapsed time per iteration (s): 2.64 | learning rate: 1.317E-04 | global batch size: 512 | lm loss: 2.119409E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 193.732 | TFLOPs: 19.94 | 63: iteration 10470/ 24424 | consumed samples: 5360640 | consumed tokens: 10978590720 | elapsed time per iteration (s): 2.27 | learning rate: 1.316E-04 | global batch size: 512 | lm loss: 2.113634E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.547 | TFLOPs: 23.22 | 63: iteration 10480/ 24424 | consumed samples: 5365760 | consumed tokens: 10989076480 | elapsed time per iteration (s): 2.31 | learning rate: 1.315E-04 | global batch size: 512 | lm loss: 2.117959E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.999 | TFLOPs: 22.85 | 63: iteration 10490/ 24424 | consumed samples: 5370880 | consumed tokens: 10999562240 | elapsed time per iteration (s): 2.25 | learning rate: 1.314E-04 | global batch size: 512 | lm loss: 2.108253E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.965 | TFLOPs: 23.47 | 63: iteration 10500/ 24424 | consumed samples: 5376000 | consumed tokens: 11010048000 | elapsed time per iteration (s): 2.27 | learning rate: 1.312E-04 | global batch size: 512 | lm loss: 2.132131E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.537 | TFLOPs: 23.22 | 63: iteration 10510/ 24424 | consumed samples: 5381120 | consumed tokens: 11020533760 | elapsed time per iteration (s): 2.24 | learning rate: 1.311E-04 | global batch size: 512 | lm loss: 2.140149E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.424 | TFLOPs: 23.52 | 63: iteration 10520/ 24424 | consumed samples: 5386240 | consumed tokens: 11031019520 | elapsed time per iteration (s): 2.24 | learning rate: 1.310E-04 | global batch size: 512 | lm loss: 2.132079E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.364 | TFLOPs: 23.51 | 63: iteration 10530/ 24424 | consumed samples: 5391360 | consumed tokens: 11041505280 | elapsed time per iteration (s): 2.24 | learning rate: 1.309E-04 | global batch size: 512 | lm loss: 2.141649E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.952 | TFLOPs: 23.57 | 63: iteration 10540/ 24424 | consumed samples: 5396480 | consumed tokens: 11051991040 | elapsed time per iteration (s): 2.28 | learning rate: 1.308E-04 | global batch size: 512 | lm loss: 2.108137E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.017 | TFLOPs: 23.16 | 63: iteration 10550/ 24424 | consumed samples: 5401600 | consumed tokens: 11062476800 | elapsed time per iteration (s): 2.27 | learning rate: 1.307E-04 | global batch size: 512 | lm loss: 2.111655E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.444 | TFLOPs: 23.21 | 63: iteration 10560/ 24424 | consumed samples: 5406720 | consumed tokens: 11072962560 | elapsed time per iteration (s): 2.24 | learning rate: 1.306E-04 | global batch size: 512 | lm loss: 2.147791E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.030 | TFLOPs: 23.58 | 63: iteration 10570/ 24424 | consumed samples: 5411840 | consumed tokens: 11083448320 | elapsed time per iteration (s): 2.25 | learning rate: 1.305E-04 | global batch size: 512 | lm loss: 2.118421E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.743 | TFLOPs: 23.45 | 63: iteration 10580/ 24424 | consumed samples: 5416960 | consumed tokens: 11093934080 | elapsed time per iteration (s): 2.23 | learning rate: 1.303E-04 | global batch size: 512 | lm loss: 2.137321E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.203 | TFLOPs: 23.60 | 63: iteration 10590/ 24424 | consumed samples: 5422080 | consumed tokens: 11104419840 | elapsed time per iteration (s): 2.24 | learning rate: 1.302E-04 | global batch size: 512 | lm loss: 2.111591E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.449 | TFLOPs: 23.52 | 63: iteration 10600/ 24424 | consumed samples: 5427200 | consumed tokens: 11114905600 | elapsed time per iteration (s): 2.23 | learning rate: 1.301E-04 | global batch size: 512 | lm loss: 2.120904E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.003 | TFLOPs: 23.68 | 63: iteration 10610/ 24424 | consumed samples: 5432320 | consumed tokens: 11125391360 | elapsed time per iteration (s): 3.56 | learning rate: 1.300E-04 | global batch size: 512 | lm loss: 2.113796E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 143.748 | TFLOPs: 14.80 | 63: iteration 10620/ 24424 | consumed samples: 5437440 | consumed tokens: 11135877120 | elapsed time per iteration (s): 2.25 | learning rate: 1.299E-04 | global batch size: 512 | lm loss: 2.096241E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.654 | TFLOPs: 23.44 | 63: iteration 10630/ 24424 | consumed samples: 5442560 | consumed tokens: 11146362880 | elapsed time per iteration (s): 4.10 | learning rate: 1.298E-04 | global batch size: 512 | lm loss: 2.140622E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 124.813 | TFLOPs: 12.85 | 63: iteration 10640/ 24424 | consumed samples: 5447680 | consumed tokens: 11156848640 | elapsed time per iteration (s): 2.23 | learning rate: 1.297E-04 | global batch size: 512 | lm loss: 2.114184E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.929 | TFLOPs: 23.67 | 63: iteration 10650/ 24424 | consumed samples: 5452800 | consumed tokens: 11167334400 | elapsed time per iteration (s): 2.23 | learning rate: 1.295E-04 | global batch size: 512 | lm loss: 2.140112E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.369 | TFLOPs: 23.61 | 63: iteration 10660/ 24424 | consumed samples: 5457920 | consumed tokens: 11177820160 | elapsed time per iteration (s): 2.26 | learning rate: 1.294E-04 | global batch size: 512 | lm loss: 2.119394E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.828 | TFLOPs: 23.35 | 63: iteration 10670/ 24424 | consumed samples: 5463040 | consumed tokens: 11188305920 | elapsed time per iteration (s): 2.25 | learning rate: 1.293E-04 | global batch size: 512 | lm loss: 2.112152E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.975 | TFLOPs: 23.47 | 63: iteration 10680/ 24424 | consumed samples: 5468160 | consumed tokens: 11198791680 | elapsed time per iteration (s): 2.24 | learning rate: 1.292E-04 | global batch size: 512 | lm loss: 2.141034E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.710 | TFLOPs: 23.54 | 63: iteration 10690/ 24424 | consumed samples: 5473280 | consumed tokens: 11209277440 | elapsed time per iteration (s): 2.25 | learning rate: 1.291E-04 | global batch size: 512 | lm loss: 2.118661E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.098 | TFLOPs: 23.38 | 63: iteration 10700/ 24424 | consumed samples: 5478400 | consumed tokens: 11219763200 | elapsed time per iteration (s): 2.33 | learning rate: 1.290E-04 | global batch size: 512 | lm loss: 2.739753E+00 | grad norm: 23.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 219.415 | TFLOPs: 22.59 | 63: iteration 10710/ 24424 | consumed samples: 5483520 | consumed tokens: 11230248960 | elapsed time per iteration (s): 2.31 | learning rate: 1.289E-04 | global batch size: 512 | lm loss: 8.998993E+00 | grad norm: 4.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.103 | TFLOPs: 22.86 | 63: iteration 10720/ 24424 | consumed samples: 5488640 | consumed tokens: 11240734720 | elapsed time per iteration (s): 2.31 | learning rate: 1.287E-04 | global batch size: 512 | lm loss: 7.903074E+00 | grad norm: 2.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.942 | TFLOPs: 22.85 | 63: iteration 10730/ 24424 | consumed samples: 5493760 | consumed tokens: 11251220480 | elapsed time per iteration (s): 2.30 | learning rate: 1.286E-04 | global batch size: 512 | lm loss: 7.250848E+00 | grad norm: 2.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.511 | TFLOPs: 22.91 | 63: iteration 10740/ 24424 | consumed samples: 5498880 | consumed tokens: 11261706240 | elapsed time per iteration (s): 2.27 | learning rate: 1.285E-04 | global batch size: 512 | lm loss: 6.786937E+00 | grad norm: 2.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.747 | TFLOPs: 23.24 | 63: iteration 10750/ 24424 | consumed samples: 5504000 | consumed tokens: 11272192000 | elapsed time per iteration (s): 2.30 | learning rate: 1.284E-04 | global batch size: 512 | lm loss: 6.190281E+00 | grad norm: 0.828 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.934 | TFLOPs: 22.95 | 63: iteration 10760/ 24424 | consumed samples: 5509120 | consumed tokens: 11282677760 | elapsed time per iteration (s): 2.26 | learning rate: 1.283E-04 | global batch size: 512 | lm loss: 5.627465E+00 | grad norm: 1.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.241 | TFLOPs: 23.29 | 63: iteration 10770/ 24424 | consumed samples: 5514240 | consumed tokens: 11293163520 | elapsed time per iteration (s): 2.28 | learning rate: 1.282E-04 | global batch size: 512 | lm loss: 4.677119E+00 | grad norm: 1.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.896 | TFLOPs: 23.15 | 63: iteration 10780/ 24424 | consumed samples: 5519360 | consumed tokens: 11303649280 | elapsed time per iteration (s): 2.26 | learning rate: 1.281E-04 | global batch size: 512 | lm loss: 3.559797E+00 | grad norm: 2.805 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.523 | TFLOPs: 23.32 | 63: iteration 10790/ 24424 | consumed samples: 5524480 | consumed tokens: 11314135040 | elapsed time per iteration (s): 2.29 | learning rate: 1.279E-04 | global batch size: 512 | lm loss: 2.922598E+00 | grad norm: 1.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.162 | TFLOPs: 22.97 | 63: iteration 10800/ 24424 | consumed samples: 5529600 | consumed tokens: 11324620800 | elapsed time per iteration (s): 2.30 | learning rate: 1.278E-04 | global batch size: 512 | lm loss: 2.594721E+00 | grad norm: 1.068 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.029 | TFLOPs: 22.96 | 63: iteration 10810/ 24424 | consumed samples: 5534720 | consumed tokens: 11335106560 | elapsed time per iteration (s): 2.25 | learning rate: 1.277E-04 | global batch size: 512 | lm loss: 2.436763E+00 | grad norm: 0.998 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.535 | TFLOPs: 23.42 | 63: iteration 10820/ 24424 | consumed samples: 5539840 | consumed tokens: 11345592320 | elapsed time per iteration (s): 2.27 | learning rate: 1.276E-04 | global batch size: 512 | lm loss: 2.332350E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.307 | TFLOPs: 23.19 | 63: iteration 10830/ 24424 | consumed samples: 5544960 | consumed tokens: 11356078080 | elapsed time per iteration (s): 2.24 | learning rate: 1.275E-04 | global batch size: 512 | lm loss: 2.244802E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.694 | TFLOPs: 23.54 | 63: iteration 10840/ 24424 | consumed samples: 5550080 | consumed tokens: 11366563840 | elapsed time per iteration (s): 2.24 | learning rate: 1.274E-04 | global batch size: 512 | lm loss: 2.202619E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.714 | TFLOPs: 23.54 | 63: iteration 10850/ 24424 | consumed samples: 5555200 | consumed tokens: 11377049600 | elapsed time per iteration (s): 2.28 | learning rate: 1.273E-04 | global batch size: 512 | lm loss: 2.167307E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.999 | TFLOPs: 23.16 | 63: iteration 10860/ 24424 | consumed samples: 5560320 | consumed tokens: 11387535360 | elapsed time per iteration (s): 2.23 | learning rate: 1.271E-04 | global batch size: 512 | lm loss: 2.170948E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.113 | TFLOPs: 23.59 | 63: iteration 10870/ 24424 | consumed samples: 5565440 | consumed tokens: 11398021120 | elapsed time per iteration (s): 2.24 | learning rate: 1.270E-04 | global batch size: 512 | lm loss: 2.149598E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.710 | TFLOPs: 23.54 | 63: iteration 10880/ 24424 | consumed samples: 5570560 | consumed tokens: 11408506880 | elapsed time per iteration (s): 2.24 | learning rate: 1.269E-04 | global batch size: 512 | lm loss: 2.167288E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.057 | TFLOPs: 23.58 | 63: iteration 10890/ 24424 | consumed samples: 5575680 | consumed tokens: 11418992640 | elapsed time per iteration (s): 2.24 | learning rate: 1.268E-04 | global batch size: 512 | lm loss: 2.148366E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.008 | TFLOPs: 23.58 | 63: iteration 10900/ 24424 | consumed samples: 5580800 | consumed tokens: 11429478400 | elapsed time per iteration (s): 2.24 | learning rate: 1.267E-04 | global batch size: 512 | lm loss: 2.158187E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.551 | TFLOPs: 23.53 | 63: iteration 10910/ 24424 | consumed samples: 5585920 | consumed tokens: 11439964160 | elapsed time per iteration (s): 2.24 | learning rate: 1.266E-04 | global batch size: 512 | lm loss: 2.147504E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.646 | TFLOPs: 23.54 | 63: iteration 10920/ 24424 | consumed samples: 5591040 | consumed tokens: 11450449920 | elapsed time per iteration (s): 2.27 | learning rate: 1.264E-04 | global batch size: 512 | lm loss: 2.158587E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.549 | TFLOPs: 23.22 | 63: iteration 10930/ 24424 | consumed samples: 5596160 | consumed tokens: 11460935680 | elapsed time per iteration (s): 2.23 | learning rate: 1.263E-04 | global batch size: 512 | lm loss: 2.172119E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.369 | TFLOPs: 23.61 | 63: iteration 10940/ 24424 | consumed samples: 5601280 | consumed tokens: 11471421440 | elapsed time per iteration (s): 4.09 | learning rate: 1.262E-04 | global batch size: 512 | lm loss: 2.145639E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 125.088 | TFLOPs: 12.88 | 63: iteration 10950/ 24424 | consumed samples: 5606400 | consumed tokens: 11481907200 | elapsed time per iteration (s): 2.25 | learning rate: 1.261E-04 | global batch size: 512 | lm loss: 2.140955E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.134 | TFLOPs: 23.38 | 63: iteration 10960/ 24424 | consumed samples: 5611520 | consumed tokens: 11492392960 | elapsed time per iteration (s): 2.53 | learning rate: 1.260E-04 | global batch size: 512 | lm loss: 2.137135E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 202.596 | TFLOPs: 20.86 | 63: iteration 10970/ 24424 | consumed samples: 5616640 | consumed tokens: 11502878720 | elapsed time per iteration (s): 2.23 | learning rate: 1.259E-04 | global batch size: 512 | lm loss: 2.146110E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.627 | TFLOPs: 23.64 | 63: iteration 10980/ 24424 | consumed samples: 5621760 | consumed tokens: 11513364480 | elapsed time per iteration (s): 2.24 | learning rate: 1.258E-04 | global batch size: 512 | lm loss: 2.156318E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.834 | TFLOPs: 23.56 | 63: iteration 10990/ 24424 | consumed samples: 5626880 | consumed tokens: 11523850240 | elapsed time per iteration (s): 2.34 | learning rate: 1.256E-04 | global batch size: 512 | lm loss: 2.139491E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 219.040 | TFLOPs: 22.55 | 63: iteration 11000/ 24424 | consumed samples: 5632000 | consumed tokens: 11534336000 | elapsed time per iteration (s): 2.25 | learning rate: 1.255E-04 | global batch size: 512 | lm loss: 2.148319E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.844 | TFLOPs: 23.46 | 63: ------------------------------------------------------------------------------------------- 63: valid loss at iteration 11000 | lm loss value: 2.103703E+00 | lm loss PPL: 8.196467E+00 | 63: ------------------------------------------------------------------------------------------- 0: saving checkpoint at iteration 11000 to checkpoints_3b9 0: [2022-11-26 00:47:01,767] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step11000 is begin to save! 0: [2022-11-26 00:47:01,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_01-model_00-model_states.pt... 32: [2022-11-26 00:47:01,788] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_21-model_00-model_states.pt... 32: [2022-11-26 00:47:02,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_21-model_00-model_states.pt. 32: [2022-11-26 00:47:02,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_22-model_00-model_states.pt... 0: [2022-11-26 00:47:02,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_01-model_00-model_states.pt. 0: [2022-11-26 00:47:02,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_03-model_00-model_states.pt... 32: [2022-11-26 00:47:02,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_22-model_00-model_states.pt. 32: [2022-11-26 00:47:02,347] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_23-model_00-model_states.pt... 0: [2022-11-26 00:47:02,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_03-model_00-model_states.pt. 0: [2022-11-26 00:47:02,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_04-model_00-model_states.pt... 32: [2022-11-26 00:47:02,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_23-model_00-model_states.pt. 32: [2022-11-26 00:47:02,584] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_24-model_00-model_states.pt... 0: [2022-11-26 00:47:02,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_04-model_00-model_states.pt. 0: [2022-11-26 00:47:02,630] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_05-model_00-model_states.pt... 32: [2022-11-26 00:47:02,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_24-model_00-model_states.pt. 32: [2022-11-26 00:47:02,820] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_25-model_00-model_states.pt... 0: [2022-11-26 00:47:02,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_05-model_00-model_states.pt. 0: [2022-11-26 00:47:02,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_06-model_00-model_states.pt... 32: [2022-11-26 00:47:03,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_25-model_00-model_states.pt. 32: [2022-11-26 00:47:03,054] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_26-model_00-model_states.pt... 0: [2022-11-26 00:47:03,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_06-model_00-model_states.pt. 0: [2022-11-26 00:47:03,107] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_07-model_00-model_states.pt... 32: [2022-11-26 00:47:03,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_26-model_00-model_states.pt. 32: [2022-11-26 00:47:03,288] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_27-model_00-model_states.pt... 0: [2022-11-26 00:47:03,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_07-model_00-model_states.pt. 0: [2022-11-26 00:47:03,331] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_08-model_00-model_states.pt... 32: [2022-11-26 00:47:03,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_27-model_00-model_states.pt. 32: [2022-11-26 00:47:03,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_28-model_00-model_states.pt... 0: [2022-11-26 00:47:03,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_08-model_00-model_states.pt. 0: [2022-11-26 00:47:03,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_09-model_00-model_states.pt... 32: [2022-11-26 00:47:03,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_28-model_00-model_states.pt. 32: [2022-11-26 00:47:03,752] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_29-model_00-model_states.pt... 0: [2022-11-26 00:47:03,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_09-model_00-model_states.pt. 0: [2022-11-26 00:47:03,780] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_10-model_00-model_states.pt... 32: [2022-11-26 00:47:03,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_29-model_00-model_states.pt. 32: [2022-11-26 00:47:03,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_30-model_00-model_states.pt... 0: [2022-11-26 00:47:04,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_10-model_00-model_states.pt. 0: [2022-11-26 00:47:04,002] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_11-model_00-model_states.pt... 32: [2022-11-26 00:47:04,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_30-model_00-model_states.pt. 32: [2022-11-26 00:47:04,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_31-model_00-model_states.pt... 0: [2022-11-26 00:47:04,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_11-model_00-model_states.pt. 0: [2022-11-26 00:47:04,226] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_12-model_00-model_states.pt... 0: [2022-11-26 00:47:04,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_12-model_00-model_states.pt. 32: [2022-11-26 00:47:04,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_31-model_00-model_states.pt. 0: [2022-11-26 00:47:04,444] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_13-model_00-model_states.pt... 32: [2022-11-26 00:47:04,445] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_32-model_00-model_states.pt... 0: [2022-11-26 00:47:04,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_13-model_00-model_states.pt. 0: [2022-11-26 00:47:04,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_14-model_00-model_states.pt... 32: [2022-11-26 00:47:04,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_32-model_00-model_states.pt. 32: [2022-11-26 00:47:04,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_33-model_00-model_states.pt... 0: [2022-11-26 00:47:04,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_14-model_00-model_states.pt. 0: [2022-11-26 00:47:04,887] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_15-model_00-model_states.pt... 32: [2022-11-26 00:47:04,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_33-model_00-model_states.pt. 32: [2022-11-26 00:47:04,899] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_34-model_00-model_states.pt... 0: [2022-11-26 00:47:05,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_15-model_00-model_states.pt. 0: [2022-11-26 00:47:05,114] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_16-model_00-model_states.pt... 32: [2022-11-26 00:47:05,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_34-model_00-model_states.pt. 32: [2022-11-26 00:47:05,127] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_35-model_00-model_states.pt... 0: [2022-11-26 00:47:05,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_16-model_00-model_states.pt. 0: [2022-11-26 00:47:05,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_17-model_00-model_states.pt... 32: [2022-11-26 00:47:05,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_35-model_00-model_states.pt. 32: [2022-11-26 00:47:05,350] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_36-model_00-model_states.pt... 0: [2022-11-26 00:47:05,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_17-model_00-model_states.pt. 0: [2022-11-26 00:47:05,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_18-model_00-model_states.pt... 32: [2022-11-26 00:47:05,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_36-model_00-model_states.pt. 32: [2022-11-26 00:47:05,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_37-model_00-model_states.pt... 0: [2022-11-26 00:47:05,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_18-model_00-model_states.pt. 0: [2022-11-26 00:47:05,774] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_19-model_00-model_states.pt... 32: [2022-11-26 00:47:05,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_37-model_00-model_states.pt. 32: [2022-11-26 00:47:05,794] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_38-model_00-model_states.pt... 0: [2022-11-26 00:47:05,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_19-model_00-model_states.pt. 0: [2022-11-26 00:47:05,991] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_20-model_00-model_states.pt... 32: [2022-11-26 00:47:06,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_38-model_00-model_states.pt. 32: [2022-11-26 00:47:06,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/layer_40-model_00-model_states.pt... 32: [2022-11-26 00:47:06,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_40-model_00-model_states.pt. 32: [2022-11-26 00:47:06,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/mp_rank_01_model_states.pt... 32: [2022-11-26 00:47:06,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/mp_rank_01_model_states.pt. 0: [2022-11-26 00:47:06,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/layer_20-model_00-model_states.pt. 0: [2022-11-26 00:47:06,210] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step11000/mp_rank_00_model_states.pt 0: [2022-11-26 00:47:06,211] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/mp_rank_00_model_states.pt... 0: [2022-11-26 00:47:06,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/mp_rank_00_model_states.pt. 51: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 55: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 55: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 53: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 53: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 53: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 53: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 57: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 57: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 61: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 61: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 61: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 59: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 59: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 63: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 63: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 52: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 54: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 54: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 56: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 56: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 58: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 58: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 58: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 58: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 62: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 62: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 60: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 60: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 60: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 0: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 33: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 33: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 33: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 35: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 35: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 34: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 40: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 40: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 44: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 44: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 44: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 32: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 42: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 42: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 46: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 46: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 46: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 36: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 36: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 36: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 48: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 48: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 41: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 41: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 37: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 37: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 47: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 49: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 43: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 43: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 45: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 45: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 39: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 39: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 50: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 38: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 38: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 51: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 51: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 55: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 55: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 53: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 57: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 61: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 61: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 59: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 63: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 19: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 19: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 19: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 19: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 19: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 19: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 21: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 21: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 21: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 21: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 21: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 21: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 15: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 15: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 15: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 15: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 15: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 52: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 52: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 54: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 56: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 58: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 62: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 62: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 62: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 16: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 16: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 60: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 60: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 2: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 2: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 2: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 2: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 2: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 4: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 4: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 14: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 14: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 14: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 14: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 18: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 18: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 18: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 10: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 10: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 8: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 8: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 8: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 8: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 0: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 0: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 11: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 17: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 17: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 17: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 17: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 9: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 9: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 13: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 13: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 23: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 23: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 23: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 25: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 25: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 27: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 27: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 27: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 27: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 27: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 27: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 29: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 29: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 29: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 29: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 29: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 29: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 31: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 5: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 5: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 5: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 5: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 5: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 33: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 1: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 7: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 7: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 7: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 7: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 3: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 3: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 3: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 3: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 3: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 3: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 35: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 35: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 22: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 22: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 22: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 12: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 12: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 12: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 12: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 12: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 30: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 24: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 24: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 24: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 28: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 28: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 28: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 28: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 20: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 20: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 34: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 34: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 34: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 40: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 40: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 40: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 44: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 6: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 6: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 32: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 32: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 42: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 42: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 42: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 46: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 46: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 36: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 48: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 41: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 41: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 37: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 37: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 47: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 47: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 47: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 47: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 49: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 49: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 49: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 43: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 45: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 45: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 39: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 50: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 50: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 50: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 38: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 38: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 53: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 63: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 19: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 19: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 21: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 21: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 52: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 54: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 56: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 16: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 16: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 2: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 2: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 4: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 4: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 4: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 4: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 14: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 18: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 10: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 10: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 10: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 10: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 8: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 8: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 8: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 8: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 0: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 26: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 11: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 11: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 11: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 17: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 9: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 13: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 13: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 13: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 23: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 25: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 25: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 27: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 29: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 31: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 5: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 33: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 1: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 1: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 1: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 1: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 1: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 7: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 3: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 35: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 22: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 22: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 22: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 12: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 12: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 30: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 30: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 30: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 30: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 24: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 24: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 24: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 24: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 28: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 20: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 20: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 20: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 20: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 44: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 6: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 32: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 32: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 48: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 37: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 43: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 39: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 15: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 15: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 16: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 2: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 4: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 14: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 14: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 18: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 18: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 10: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 10: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 0: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 26: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 26: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 11: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 17: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 17: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 9: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 9: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 9: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 9: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 13: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 23: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 23: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 25: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 25: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 27: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 29: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 31: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 1: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 1: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 7: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 3: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 22: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 22: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 12: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 30: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 30: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 24: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 28: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 20: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 6: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 6: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 6: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 32: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 16: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 4: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 14: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 18: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 0: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 26: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 11: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 9: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 23: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 23: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 25: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 31: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 31: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 7: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 30: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 28: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 6: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 32: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 32: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 16: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 0: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 26: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 11: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 13: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 31: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 31: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 6: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 16: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 26: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 31: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 26: [2022-11-26 00:47:06,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step11000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 0: [2022-11-26 00:47:06,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 23: [2022-11-26 00:47:06,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-26 00:47:06,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 19: [2022-11-26 00:47:06,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 23: [2022-11-26 00:47:06,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 19: [2022-11-26 00:47:06,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-26 00:47:06,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 17: [2022-11-26 00:47:06,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 17: [2022-11-26 00:47:06,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-26 00:47:06,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 55: [2022-11-26 00:47:06,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 28: [2022-11-26 00:47:06,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 55: [2022-11-26 00:47:06,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 28: [2022-11-26 00:47:06,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 55: [2022-11-26 00:47:06,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 28: [2022-11-26 00:47:06,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 21: [2022-11-26 00:47:06,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:47:06,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-26 00:47:06,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 35: [2022-11-26 00:47:06,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-26 00:47:06,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-26 00:47:06,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 58: [2022-11-26 00:47:06,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 10: [2022-11-26 00:47:06,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 58: [2022-11-26 00:47:06,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 10: [2022-11-26 00:47:06,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 58: [2022-11-26 00:47:06,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 10: [2022-11-26 00:47:06,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 12: [2022-11-26 00:47:06,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 53: [2022-11-26 00:47:06,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 12: [2022-11-26 00:47:06,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 53: [2022-11-26 00:47:06,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 12: [2022-11-26 00:47:06,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 53: [2022-11-26 00:47:06,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 52: [2022-11-26 00:47:06,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 52: [2022-11-26 00:47:06,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-26 00:47:06,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 8: [2022-11-26 00:47:06,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 8: [2022-11-26 00:47:06,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-26 00:47:06,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 5: [2022-11-26 00:47:06,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 5: [2022-11-26 00:47:06,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-26 00:47:06,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 43: [2022-11-26 00:47:06,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 43: [2022-11-26 00:47:06,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 43: [2022-11-26 00:47:06,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 61: [2022-11-26 00:47:06,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-26 00:47:06,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-26 00:47:06,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 47: [2022-11-26 00:47:06,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-26 00:47:06,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-26 00:47:06,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 17: [2022-11-26 00:47:06,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-26 00:47:06,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-26 00:47:06,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 56: [2022-11-26 00:47:06,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 45: [2022-11-26 00:47:06,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 56: [2022-11-26 00:47:06,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 56: [2022-11-26 00:47:06,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 62: [2022-11-26 00:47:06,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 45: [2022-11-26 00:47:06,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 62: [2022-11-26 00:47:06,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 45: [2022-11-26 00:47:06,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 54: [2022-11-26 00:47:06,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 62: [2022-11-26 00:47:06,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 54: [2022-11-26 00:47:06,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-26 00:47:06,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 27: [2022-11-26 00:47:06,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-26 00:47:06,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-26 00:47:06,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 36: [2022-11-26 00:47:06,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 44: [2022-11-26 00:47:06,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-26 00:47:06,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-26 00:47:06,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 19: [2022-11-26 00:47:06,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 0: [2022-11-26 00:47:06,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 19: [2022-11-26 00:47:06,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-26 00:47:06,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 0: [2022-11-26 00:47:06,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-26 00:47:06,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 27: [2022-11-26 00:47:06,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-26 00:47:06,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-26 00:47:06,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 9: [2022-11-26 00:47:06,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 42: [2022-11-26 00:47:06,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 9: [2022-11-26 00:47:06,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-26 00:47:06,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 42: [2022-11-26 00:47:06,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 55: [2022-11-26 00:47:06,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 42: [2022-11-26 00:47:06,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 55: [2022-11-26 00:47:06,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-26 00:47:06,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 16: [2022-11-26 00:47:06,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 50: [2022-11-26 00:47:06,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 16: [2022-11-26 00:47:06,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 50: [2022-11-26 00:47:06,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 16: [2022-11-26 00:47:06,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 50: [2022-11-26 00:47:06,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 47: [2022-11-26 00:47:06,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 37: [2022-11-26 00:47:06,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 47: [2022-11-26 00:47:06,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 37: [2022-11-26 00:47:06,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 47: [2022-11-26 00:47:06,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 37: [2022-11-26 00:47:06,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 54: [2022-11-26 00:47:06,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 54: [2022-11-26 00:47:06,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-26 00:47:06,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 58: [2022-11-26 00:47:06,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-26 00:47:06,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-26 00:47:06,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 35: [2022-11-26 00:47:06,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 50: [2022-11-26 00:47:06,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 35: [2022-11-26 00:47:06,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 50: [2022-11-26 00:47:06,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 35: [2022-11-26 00:47:06,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 50: [2022-11-26 00:47:06,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 8: [2022-11-26 00:47:06,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 25: [2022-11-26 00:47:06,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 8: [2022-11-26 00:47:06,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 25: [2022-11-26 00:47:06,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 30: [2022-11-26 00:47:06,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 8: [2022-11-26 00:47:06,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 25: [2022-11-26 00:47:06,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 44: [2022-11-26 00:47:06,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 30: [2022-11-26 00:47:06,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 44: [2022-11-26 00:47:06,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-26 00:47:06,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 30: [2022-11-26 00:47:06,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 48: [2022-11-26 00:47:06,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-26 00:47:06,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-26 00:47:06,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 28: [2022-11-26 00:47:06,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 53: [2022-11-26 00:47:06,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 28: [2022-11-26 00:47:06,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 53: [2022-11-26 00:47:06,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 28: [2022-11-26 00:47:06,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 53: [2022-11-26 00:47:06,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 52: [2022-11-26 00:47:06,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-26 00:47:06,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 23: [2022-11-26 00:47:06,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 52: [2022-11-26 00:47:06,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 23: [2022-11-26 00:47:06,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-26 00:47:06,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 47: [2022-11-26 00:47:06,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 5: [2022-11-26 00:47:06,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 32: [2022-11-26 00:47:06,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 47: [2022-11-26 00:47:06,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 5: [2022-11-26 00:47:06,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 32: [2022-11-26 00:47:06,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 47: [2022-11-26 00:47:06,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 5: [2022-11-26 00:47:06,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 32: [2022-11-26 00:47:06,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 57: [2022-11-26 00:47:06,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 11: [2022-11-26 00:47:06,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 57: [2022-11-26 00:47:06,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-26 00:47:06,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 11: [2022-11-26 00:47:06,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-26 00:47:06,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 37: [2022-11-26 00:47:06,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 36: [2022-11-26 00:47:06,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-26 00:47:06,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 36: [2022-11-26 00:47:06,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-26 00:47:06,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 36: [2022-11-26 00:47:06,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 37: [2022-11-26 00:47:06,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-26 00:47:06,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 11: [2022-11-26 00:47:06,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-26 00:47:06,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-26 00:47:06,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 0: [2022-11-26 00:47:06,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-26 00:47:06,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-26 00:47:06,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 51: [2022-11-26 00:47:06,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-26 00:47:06,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-26 00:47:06,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 61: [2022-11-26 00:47:06,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-26 00:47:06,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-26 00:47:06,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 30: [2022-11-26 00:47:06,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:47:06,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 26: [2022-11-26 00:47:06,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:47:06,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 26: [2022-11-26 00:47:06,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-26 00:47:06,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 49: [2022-11-26 00:47:06,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 49: [2022-11-26 00:47:06,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-26 00:47:06,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 4: [2022-11-26 00:47:06,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 4: [2022-11-26 00:47:06,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-26 00:47:06,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-26 00:47:06,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-26 00:47:06,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 4: [2022-11-26 00:47:06,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 25: [2022-11-26 00:47:06,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 25: [2022-11-26 00:47:06,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 25: [2022-11-26 00:47:06,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 33: [2022-11-26 00:47:06,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-26 00:47:06,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 33: [2022-11-26 00:47:06,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-26 00:47:06,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-26 00:47:06,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 33: [2022-11-26 00:47:06,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 10: [2022-11-26 00:47:06,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 10: [2022-11-26 00:47:06,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-26 00:47:06,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 42: [2022-11-26 00:47:06,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 42: [2022-11-26 00:47:06,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 23: [2022-11-26 00:47:06,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-26 00:47:06,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 42: [2022-11-26 00:47:06,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 23: [2022-11-26 00:47:06,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 1: [2022-11-26 00:47:06,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-26 00:47:06,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-26 00:47:06,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 55: [2022-11-26 00:47:06,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-26 00:47:06,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-26 00:47:06,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 49: [2022-11-26 00:47:06,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-26 00:47:06,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-26 00:47:06,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 15: [2022-11-26 00:47:06,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 15: [2022-11-26 00:47:06,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-26 00:47:06,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 20: [2022-11-26 00:47:06,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:47:06,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-26 00:47:06,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 43: [2022-11-26 00:47:06,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-26 00:47:06,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-26 00:47:06,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 21: [2022-11-26 00:47:06,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:47:06,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 59: [2022-11-26 00:47:06,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 19: [2022-11-26 00:47:06,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:47:06,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 19: [2022-11-26 00:47:06,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-26 00:47:06,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 22: [2022-11-26 00:47:06,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-26 00:47:06,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-26 00:47:06,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 59: [2022-11-26 00:47:06,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-26 00:47:06,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 48: [2022-11-26 00:47:06,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 9: [2022-11-26 00:47:06,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 48: [2022-11-26 00:47:06,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-26 00:47:06,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 50: [2022-11-26 00:47:06,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 50: [2022-11-26 00:47:06,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 34: [2022-11-26 00:47:06,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 50: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 1: [2022-11-26 00:47:06,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 27: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 51: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 27: [2022-11-26 00:47:06,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 34: [2022-11-26 00:47:06,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-26 00:47:06,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 27: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 1: [2022-11-26 00:47:06,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 51: [2022-11-26 00:47:06,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 1: [2022-11-26 00:47:06,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 51: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 34: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 57: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 34: [2022-11-26 00:47:06,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 57: [2022-11-26 00:47:06,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 34: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 59: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 57: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 59: [2022-11-26 00:47:06,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 34: [2022-11-26 00:47:06,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 59: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 34: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 33: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-26 00:47:06,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 40: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-26 00:47:06,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-26 00:47:06,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 40: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 28: [2022-11-26 00:47:06,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-26 00:47:06,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-26 00:47:06,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 16: [2022-11-26 00:47:06,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 5: [2022-11-26 00:47:06,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-26 00:47:06,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-26 00:47:06,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 16: [2022-11-26 00:47:06,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-26 00:47:06,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 35: [2022-11-26 00:47:06,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 36: [2022-11-26 00:47:06,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 35: [2022-11-26 00:47:06,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 36: [2022-11-26 00:47:06,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 35: [2022-11-26 00:47:06,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 36: [2022-11-26 00:47:06,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 18: [2022-11-26 00:47:06,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 53: [2022-11-26 00:47:06,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 18: [2022-11-26 00:47:06,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 53: [2022-11-26 00:47:06,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 18: [2022-11-26 00:47:06,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 53: [2022-11-26 00:47:06,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 26: [2022-11-26 00:47:06,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 26: [2022-11-26 00:47:06,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-26 00:47:06,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 26: [2022-11-26 00:47:06,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-26 00:47:06,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-26 00:47:06,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 8: [2022-11-26 00:47:06,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-26 00:47:06,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 52: [2022-11-26 00:47:06,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 8: [2022-11-26 00:47:06,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 49: [2022-11-26 00:47:06,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 52: [2022-11-26 00:47:06,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 16: [2022-11-26 00:47:06,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 49: [2022-11-26 00:47:06,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 52: [2022-11-26 00:47:06,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 16: [2022-11-26 00:47:06,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 49: [2022-11-26 00:47:06,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 16: [2022-11-26 00:47:06,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 32: [2022-11-26 00:47:06,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-26 00:47:06,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-26 00:47:06,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 37: [2022-11-26 00:47:06,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-26 00:47:06,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-26 00:47:06,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 9: [2022-11-26 00:47:06,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 22: [2022-11-26 00:47:06,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 9: [2022-11-26 00:47:06,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 9: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 22: [2022-11-26 00:47:06,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 9: [2022-11-26 00:47:06,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-26 00:47:06,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 22: [2022-11-26 00:47:06,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 0: [2022-11-26 00:47:06,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 41: [2022-11-26 00:47:06,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-26 00:47:06,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 41: [2022-11-26 00:47:06,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 0: [2022-11-26 00:47:06,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-26 00:47:06,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 20: [2022-11-26 00:47:06,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:47:06,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-26 00:47:06,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 56: [2022-11-26 00:47:06,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 41: [2022-11-26 00:47:06,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-26 00:47:06,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 56: [2022-11-26 00:47:06,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 41: [2022-11-26 00:47:06,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 56: [2022-11-26 00:47:06,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 14: [2022-11-26 00:47:06,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-26 00:47:06,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-26 00:47:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-26 00:47:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-26 00:47:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 14: [2022-11-26 00:47:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 25: [2022-11-26 00:47:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 25: [2022-11-26 00:47:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-26 00:47:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 60: [2022-11-26 00:47:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-26 00:47:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-26 00:47:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-26 00:47:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-26 00:47:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-26 00:47:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-26 00:47:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 60: [2022-11-26 00:47:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 60: [2022-11-26 00:47:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 11: [2022-11-26 00:47:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-26 00:47:06,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 11: [2022-11-26 00:47:06,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 31: [2022-11-26 00:47:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-26 00:47:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 31: [2022-11-26 00:47:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-26 00:47:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-26 00:47:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 31: [2022-11-26 00:47:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 56: [2022-11-26 00:47:06,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-26 00:47:06,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-26 00:47:06,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 42: [2022-11-26 00:47:06,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 42: [2022-11-26 00:47:06,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-26 00:47:06,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 63: [2022-11-26 00:47:06,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-26 00:47:06,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-26 00:47:06,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-26 00:47:06,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 46: [2022-11-26 00:47:06,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-26 00:47:06,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-26 00:47:06,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 63: [2022-11-26 00:47:06,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-26 00:47:06,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-26 00:47:06,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 46: [2022-11-26 00:47:06,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-26 00:47:06,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 63: [2022-11-26 00:47:06,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 63: [2022-11-26 00:47:06,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 46: [2022-11-26 00:47:06,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-26 00:47:06,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 46: [2022-11-26 00:47:06,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 46: [2022-11-26 00:47:06,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 32: [2022-11-26 00:47:06,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-26 00:47:06,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-26 00:47:06,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 43: [2022-11-26 00:47:06,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-26 00:47:06,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-26 00:47:06,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 40: [2022-11-26 00:47:06,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-26 00:47:06,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-26 00:47:06,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 38: [2022-11-26 00:47:06,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 38: [2022-11-26 00:47:06,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-26 00:47:06,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-26 00:47:06,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-26 00:47:06,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-26 00:47:06,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-26 00:47:06,513] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 38: [2022-11-26 00:47:06,513] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 38: [2022-11-26 00:47:06,513] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 24: [2022-11-26 00:47:06,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 24: [2022-11-26 00:47:06,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-26 00:47:06,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-26 00:47:06,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-26 00:47:06,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 24: [2022-11-26 00:47:06,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 2: [2022-11-26 00:47:06,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-26 00:47:06,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-26 00:47:06,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-26 00:47:06,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 2: [2022-11-26 00:47:06,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-26 00:47:06,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-26 00:47:06,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-26 00:47:06,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-26 00:47:06,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 2: [2022-11-26 00:47:06,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 2: [2022-11-26 00:47:06,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 2: [2022-11-26 00:47:06,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 47: [2022-11-26 00:47:06,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-26 00:47:06,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-26 00:47:06,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 41: [2022-11-26 00:47:06,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-26 00:47:06,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-26 00:47:06,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 3: [2022-11-26 00:47:06,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-26 00:47:06,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 15: [2022-11-26 00:47:06,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 3: [2022-11-26 00:47:06,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 3: [2022-11-26 00:47:06,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 15: [2022-11-26 00:47:06,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 3: [2022-11-26 00:47:06,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 3: [2022-11-26 00:47:06,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 48: [2022-11-26 00:47:06,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 15: [2022-11-26 00:47:06,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 48: [2022-11-26 00:47:06,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-26 00:47:06,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 51: [2022-11-26 00:47:06,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 51: [2022-11-26 00:47:06,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-26 00:47:06,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 4: [2022-11-26 00:47:06,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 4: [2022-11-26 00:47:06,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-26 00:47:06,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 13: [2022-11-26 00:47:06,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-26 00:47:06,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-26 00:47:06,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-26 00:47:06,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-26 00:47:06,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 13: [2022-11-26 00:47:06,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 26: [2022-11-26 00:47:06,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 26: [2022-11-26 00:47:06,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-26 00:47:06,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 11: [2022-11-26 00:47:06,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 11: [2022-11-26 00:47:06,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 0: [2022-11-26 00:47:06,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 11: [2022-11-26 00:47:06,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 0: [2022-11-26 00:47:06,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 25: [2022-11-26 00:47:06,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-26 00:47:06,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-26 00:47:06,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 21: [2022-11-26 00:47:06,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:47:06,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-26 00:47:06,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 37: [2022-11-26 00:47:06,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-26 00:47:06,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-26 00:47:06,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 10: [2022-11-26 00:47:06,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 9: [2022-11-26 00:47:06,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 10: [2022-11-26 00:47:06,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 9: [2022-11-26 00:47:06,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 10: [2022-11-26 00:47:06,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 9: [2022-11-26 00:47:06,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 36: [2022-11-26 00:47:06,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 36: [2022-11-26 00:47:06,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-26 00:47:06,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 28: [2022-11-26 00:47:06,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-26 00:47:06,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-26 00:47:06,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 57: [2022-11-26 00:47:06,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 57: [2022-11-26 00:47:06,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-26 00:47:06,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 7: [2022-11-26 00:47:06,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-26 00:47:06,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-26 00:47:06,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-26 00:47:06,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 7: [2022-11-26 00:47:06,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-26 00:47:06,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 17: [2022-11-26 00:47:06,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-26 00:47:06,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 23: [2022-11-26 00:47:06,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 17: [2022-11-26 00:47:06,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 23: [2022-11-26 00:47:06,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-26 00:47:06,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 30: [2022-11-26 00:47:06,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:47:06,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-26 00:47:06,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 38: [2022-11-26 00:47:06,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-26 00:47:06,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-26 00:47:06,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 32: [2022-11-26 00:47:06,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 32: [2022-11-26 00:47:06,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-26 00:47:06,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 6: [2022-11-26 00:47:06,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-26 00:47:06,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 6: [2022-11-26 00:47:06,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-26 00:47:06,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 6: [2022-11-26 00:47:06,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-26 00:47:06,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-26 00:47:06,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 6: [2022-11-26 00:47:06,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 6: [2022-11-26 00:47:06,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 44: [2022-11-26 00:47:06,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-26 00:47:06,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-26 00:47:06,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 18: [2022-11-26 00:47:06,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-26 00:47:06,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-26 00:47:06,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 24: [2022-11-26 00:47:06,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-26 00:47:06,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-26 00:47:06,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 39: [2022-11-26 00:47:06,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-26 00:47:06,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-26 00:47:06,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-26 00:47:06,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-26 00:47:06,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-26 00:47:06,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-26 00:47:06,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-26 00:47:06,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-26 00:47:06,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 39: [2022-11-26 00:47:06,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 39: [2022-11-26 00:47:06,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 39: [2022-11-26 00:47:06,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 60: [2022-11-26 00:47:06,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-26 00:47:06,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-26 00:47:06,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 6: [2022-11-26 00:47:06,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-26 00:47:06,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-26 00:47:06,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 12: [2022-11-26 00:47:06,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 12: [2022-11-26 00:47:06,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-26 00:47:06,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 61: [2022-11-26 00:47:06,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-26 00:47:06,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-26 00:47:06,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 59: [2022-11-26 00:47:06,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-26 00:47:06,549] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-26 00:47:06,549] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 18: [2022-11-26 00:47:06,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-26 00:47:06,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-26 00:47:06,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 62: [2022-11-26 00:47:06,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-26 00:47:06,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-26 00:47:06,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 21: [2022-11-26 00:47:06,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:47:06,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-26 00:47:06,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 54: [2022-11-26 00:47:06,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 54: [2022-11-26 00:47:06,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-26 00:47:06,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 34: [2022-11-26 00:47:06,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 34: [2022-11-26 00:47:06,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-26 00:47:06,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 12: [2022-11-26 00:47:06,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-26 00:47:06,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-26 00:47:06,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 31: [2022-11-26 00:47:06,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-26 00:47:06,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-26 00:47:06,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 58: [2022-11-26 00:47:06,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 58: [2022-11-26 00:47:06,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-26 00:47:06,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 19: [2022-11-26 00:47:06,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-26 00:47:06,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-26 00:47:06,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 62: [2022-11-26 00:47:06,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-26 00:47:06,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-26 00:47:06,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 30: [2022-11-26 00:47:06,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:47:06,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-26 00:47:06,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 7: [2022-11-26 00:47:06,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-26 00:47:06,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 50: [2022-11-26 00:47:06,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 7: [2022-11-26 00:47:06,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 50: [2022-11-26 00:47:06,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-26 00:47:06,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 27: [2022-11-26 00:47:06,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-26 00:47:06,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-26 00:47:06,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 13: [2022-11-26 00:47:06,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 13: [2022-11-26 00:47:06,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-26 00:47:06,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 8: [2022-11-26 00:47:06,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-26 00:47:06,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-26 00:47:06,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 61: [2022-11-26 00:47:06,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-26 00:47:06,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-26 00:47:06,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 55: [2022-11-26 00:47:06,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 55: [2022-11-26 00:47:06,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-26 00:47:06,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 45: [2022-11-26 00:47:06,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-26 00:47:06,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-26 00:47:06,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-26 00:47:06,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-26 00:47:06,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-26 00:47:06,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 45: [2022-11-26 00:47:06,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-26 00:47:06,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 45: [2022-11-26 00:47:06,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 14: [2022-11-26 00:47:06,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 14: [2022-11-26 00:47:06,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-26 00:47:06,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 29: [2022-11-26 00:47:06,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-26 00:47:06,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 29: [2022-11-26 00:47:06,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-26 00:47:06,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 29: [2022-11-26 00:47:06,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-26 00:47:06,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 1: [2022-11-26 00:47:06,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-26 00:47:06,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-26 00:47:06,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 15: [2022-11-26 00:47:06,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-26 00:47:06,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-26 00:47:06,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 17: [2022-11-26 00:47:06,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-26 00:47:06,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-26 00:47:06,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 3: [2022-11-26 00:47:06,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-26 00:47:06,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-26 00:47:06,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 52: [2022-11-26 00:47:06,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-26 00:47:06,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-26 00:47:06,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 10: [2022-11-26 00:47:06,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-26 00:47:06,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-26 00:47:06,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 62: [2022-11-26 00:47:06,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-26 00:47:06,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-26 00:47:06,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 33: [2022-11-26 00:47:06,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-26 00:47:06,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-26 00:47:06,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 5: [2022-11-26 00:47:06,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 5: [2022-11-26 00:47:06,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-26 00:47:06,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 16: [2022-11-26 00:47:06,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-26 00:47:06,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 16: [2022-11-26 00:47:06,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 63: [2022-11-26 00:47:06,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 63: [2022-11-26 00:47:06,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-26 00:47:06,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 49: [2022-11-26 00:47:06,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-26 00:47:06,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 49: [2022-11-26 00:47:06,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 46: [2022-11-26 00:47:06,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-26 00:47:06,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-26 00:47:06,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 43: [2022-11-26 00:47:06,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-26 00:47:06,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-26 00:47:06,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 12: [2022-11-26 00:47:06,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 22: [2022-11-26 00:47:06,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 12: [2022-11-26 00:47:06,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 22: [2022-11-26 00:47:06,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 12: [2022-11-26 00:47:06,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 22: [2022-11-26 00:47:06,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 53: [2022-11-26 00:47:06,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-26 00:47:06,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-26 00:47:06,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 48: [2022-11-26 00:47:06,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-26 00:47:06,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-26 00:47:06,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 35: [2022-11-26 00:47:06,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 35: [2022-11-26 00:47:06,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-26 00:47:06,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 56: [2022-11-26 00:47:06,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 51: [2022-11-26 00:47:06,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 56: [2022-11-26 00:47:06,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 51: [2022-11-26 00:47:06,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 56: [2022-11-26 00:47:06,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 51: [2022-11-26 00:47:06,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 0: [2022-11-26 00:47:06,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-26 00:47:06,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-26 00:47:06,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 26: [2022-11-26 00:47:06,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 54: [2022-11-26 00:47:06,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 26: [2022-11-26 00:47:06,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 44: [2022-11-26 00:47:06,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-26 00:47:06,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-26 00:47:06,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 26: [2022-11-26 00:47:06,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 54: [2022-11-26 00:47:06,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-26 00:47:06,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 41: [2022-11-26 00:47:06,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-26 00:47:06,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-26 00:47:06,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 11: [2022-11-26 00:47:06,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-26 00:47:06,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-26 00:47:06,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 61: [2022-11-26 00:47:06,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 61: [2022-11-26 00:47:06,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-26 00:47:06,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 9: [2022-11-26 00:47:06,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 9: [2022-11-26 00:47:06,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 58: [2022-11-26 00:47:06,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 9: [2022-11-26 00:47:06,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 58: [2022-11-26 00:47:06,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-26 00:47:06,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 25: [2022-11-26 00:47:06,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 25: [2022-11-26 00:47:06,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-26 00:47:06,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 36: [2022-11-26 00:47:06,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-26 00:47:06,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-26 00:47:06,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 28: [2022-11-26 00:47:06,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 28: [2022-11-26 00:47:06,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-26 00:47:06,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 37: [2022-11-26 00:47:06,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-26 00:47:06,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-26 00:47:06,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 2: [2022-11-26 00:47:06,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-26 00:47:06,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-26 00:47:06,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 42: [2022-11-26 00:47:06,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 42: [2022-11-26 00:47:06,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-26 00:47:06,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 40: [2022-11-26 00:47:06,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-26 00:47:06,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-26 00:47:06,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 47: [2022-11-26 00:47:06,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-26 00:47:06,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-26 00:47:06,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 4: [2022-11-26 00:47:06,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:47:06,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 4: [2022-11-26 00:47:06,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-26 00:47:06,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 30: [2022-11-26 00:47:06,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 30: [2022-11-26 00:47:06,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 12: [2022-11-26 00:47:06,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-26 00:47:06,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-26 00:47:06,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 60: [2022-11-26 00:47:06,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-26 00:47:06,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-26 00:47:06,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 20: [2022-11-26 00:47:06,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:47:06,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 20: [2022-11-26 00:47:06,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 29: [2022-11-26 00:47:06,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-26 00:47:06,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-26 00:47:06,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 32: [2022-11-26 00:47:06,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-26 00:47:06,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-26 00:47:06,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 23: [2022-11-26 00:47:06,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-26 00:47:06,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-26 00:47:06,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 21: [2022-11-26 00:47:06,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:47:06,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-26 00:47:06,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 24: [2022-11-26 00:47:06,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 24: [2022-11-26 00:47:06,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-26 00:47:06,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 15: [2022-11-26 00:47:06,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-26 00:47:06,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-26 00:47:06,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 31: [2022-11-26 00:47:06,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-26 00:47:06,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-26 00:47:06,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 57: [2022-11-26 00:47:06,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 38: [2022-11-26 00:47:06,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 38: [2022-11-26 00:47:06,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 57: [2022-11-26 00:47:06,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 38: [2022-11-26 00:47:06,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 57: [2022-11-26 00:47:06,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 59: [2022-11-26 00:47:06,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 59: [2022-11-26 00:47:06,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-26 00:47:06,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 7: [2022-11-26 00:47:06,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-26 00:47:06,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-26 00:47:06,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 8: [2022-11-26 00:47:06,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-26 00:47:06,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-26 00:47:06,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 27: [2022-11-26 00:47:06,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 27: [2022-11-26 00:47:06,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-26 00:47:06,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 50: [2022-11-26 00:47:06,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-26 00:47:06,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-26 00:47:06,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 14: [2022-11-26 00:47:06,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 55: [2022-11-26 00:47:06,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-26 00:47:06,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 14: [2022-11-26 00:47:06,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 55: [2022-11-26 00:47:06,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 14: [2022-11-26 00:47:06,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 6: [2022-11-26 00:47:06,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-26 00:47:06,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-26 00:47:06,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 62: [2022-11-26 00:47:06,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-26 00:47:06,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 62: [2022-11-26 00:47:06,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 18: [2022-11-26 00:47:06,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-26 00:47:06,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-26 00:47:06,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 39: [2022-11-26 00:47:06,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-26 00:47:06,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-26 00:47:06,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 34: [2022-11-26 00:47:06,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-26 00:47:06,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-26 00:47:06,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 19: [2022-11-26 00:47:06,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-26 00:47:06,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-26 00:47:06,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 52: [2022-11-26 00:47:06,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-26 00:47:06,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-26 00:47:06,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 58: [2022-11-26 00:47:06,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 58: [2022-11-26 00:47:06,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-26 00:47:06,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 17: [2022-11-26 00:47:06,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-26 00:47:06,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-26 00:47:06,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 46: [2022-11-26 00:47:06,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-26 00:47:06,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-26 00:47:06,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 48: [2022-11-26 00:47:06,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-26 00:47:06,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 13: [2022-11-26 00:47:06,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 48: [2022-11-26 00:47:06,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 13: [2022-11-26 00:47:06,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-26 00:47:06,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 5: [2022-11-26 00:47:06,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-26 00:47:06,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-26 00:47:06,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 10: [2022-11-26 00:47:06,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-26 00:47:06,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 43: [2022-11-26 00:47:06,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 10: [2022-11-26 00:47:06,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 43: [2022-11-26 00:47:06,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-26 00:47:06,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 16: [2022-11-26 00:47:06,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-26 00:47:06,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 63: [2022-11-26 00:47:06,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 16: [2022-11-26 00:47:06,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 63: [2022-11-26 00:47:06,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-26 00:47:06,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 22: [2022-11-26 00:47:06,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 22: [2022-11-26 00:47:06,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 22: [2022-11-26 00:47:06,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 3: [2022-11-26 00:47:06,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-26 00:47:06,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-26 00:47:06,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 33: [2022-11-26 00:47:06,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-26 00:47:06,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-26 00:47:06,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 51: [2022-11-26 00:47:06,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-26 00:47:06,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-26 00:47:06,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 1: [2022-11-26 00:47:06,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 1: [2022-11-26 00:47:06,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 1: [2022-11-26 00:47:06,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 56: [2022-11-26 00:47:06,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-26 00:47:06,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-26 00:47:06,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 53: [2022-11-26 00:47:06,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-26 00:47:06,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-26 00:47:06,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 0: [2022-11-26 00:47:06,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 35: [2022-11-26 00:47:06,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 0: [2022-11-26 00:47:06,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 35: [2022-11-26 00:47:06,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 0: [2022-11-26 00:47:06,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 35: [2022-11-26 00:47:06,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 2: [2022-11-26 00:47:06,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-26 00:47:06,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-26 00:47:06,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 54: [2022-11-26 00:47:06,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 49: [2022-11-26 00:47:06,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-26 00:47:06,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-26 00:47:06,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 44: [2022-11-26 00:47:06,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 44: [2022-11-26 00:47:06,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 44: [2022-11-26 00:47:06,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 54: [2022-11-26 00:47:06,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-26 00:47:06,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 28: [2022-11-26 00:47:06,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-26 00:47:06,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-26 00:47:06,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 26: [2022-11-26 00:47:06,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-26 00:47:06,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-26 00:47:06,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 11: [2022-11-26 00:47:06,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-26 00:47:06,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-26 00:47:06,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 29: [2022-11-26 00:47:06,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-26 00:47:06,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-26 00:47:06,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 42: [2022-11-26 00:47:06,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 42: [2022-11-26 00:47:06,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 42: [2022-11-26 00:47:06,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 30: [2022-11-26 00:47:06,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:47:06,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-26 00:47:06,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 9: [2022-11-26 00:47:06,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-26 00:47:06,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-26 00:47:06,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 47: [2022-11-26 00:47:06,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-26 00:47:06,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-26 00:47:06,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 36: [2022-11-26 00:47:06,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 36: [2022-11-26 00:47:06,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-26 00:47:06,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 25: [2022-11-26 00:47:06,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 25: [2022-11-26 00:47:06,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-26 00:47:06,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 37: [2022-11-26 00:47:06,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-26 00:47:06,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-26 00:47:06,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 15: [2022-11-26 00:47:06,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 15: [2022-11-26 00:47:06,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-26 00:47:06,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 57: [2022-11-26 00:47:06,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-26 00:47:06,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-26 00:47:06,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 4: [2022-11-26 00:47:06,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 61: [2022-11-26 00:47:06,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 4: [2022-11-26 00:47:06,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 61: [2022-11-26 00:47:06,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 4: [2022-11-26 00:47:06,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 61: [2022-11-26 00:47:06,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 23: [2022-11-26 00:47:06,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-26 00:47:06,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-26 00:47:06,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 38: [2022-11-26 00:47:06,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 41: [2022-11-26 00:47:06,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 38: [2022-11-26 00:47:06,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 41: [2022-11-26 00:47:06,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 38: [2022-11-26 00:47:06,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 41: [2022-11-26 00:47:06,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 32: [2022-11-26 00:47:06,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-26 00:47:06,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 60: [2022-11-26 00:47:06,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 32: [2022-11-26 00:47:06,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 60: [2022-11-26 00:47:06,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-26 00:47:06,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 45: [2022-11-26 00:47:06,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-26 00:47:06,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-26 00:47:06,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 39: [2022-11-26 00:47:06,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-26 00:47:06,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-26 00:47:06,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 21: [2022-11-26 00:47:06,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:47:06,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-26 00:47:06,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 24: [2022-11-26 00:47:06,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-26 00:47:06,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-26 00:47:06,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 34: [2022-11-26 00:47:06,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 19: [2022-11-26 00:47:06,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 34: [2022-11-26 00:47:06,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 19: [2022-11-26 00:47:06,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-26 00:47:06,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 34: [2022-11-26 00:47:06,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 50: [2022-11-26 00:47:06,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-26 00:47:06,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-26 00:47:06,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 55: [2022-11-26 00:47:06,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 55: [2022-11-26 00:47:06,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 31: [2022-11-26 00:47:06,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 55: [2022-11-26 00:47:06,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 31: [2022-11-26 00:47:06,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-26 00:47:06,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 6: [2022-11-26 00:47:06,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 6: [2022-11-26 00:47:06,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 6: [2022-11-26 00:47:06,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 20: [2022-11-26 00:47:06,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:47:06,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 51: [2022-11-26 00:47:06,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 55: [2022-11-26 00:47:06,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 53: [2022-11-26 00:47:06,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 57: [2022-11-26 00:47:06,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 61: [2022-11-26 00:47:06,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 59: [2022-11-26 00:47:06,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 63: [2022-11-26 00:47:06,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 19: [2022-11-26 00:47:06,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 21: [2022-11-26 00:47:06,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 15: [2022-11-26 00:47:06,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 52: [2022-11-26 00:47:06,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 54: [2022-11-26 00:47:06,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 56: [2022-11-26 00:47:06,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 58: [2022-11-26 00:47:06,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 62: [2022-11-26 00:47:06,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 16: [2022-11-26 00:47:06,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 60: [2022-11-26 00:47:06,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 2: [2022-11-26 00:47:06,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 4: [2022-11-26 00:47:06,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 14: [2022-11-26 00:47:06,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 18: [2022-11-26 00:47:06,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 10: [2022-11-26 00:47:06,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 8: [2022-11-26 00:47:06,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 0: [2022-11-26 00:47:06,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 26: [2022-11-26 00:47:06,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 11: [2022-11-26 00:47:06,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 17: [2022-11-26 00:47:06,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 9: [2022-11-26 00:47:06,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 13: [2022-11-26 00:47:06,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 23: [2022-11-26 00:47:06,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 25: [2022-11-26 00:47:06,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 27: [2022-11-26 00:47:06,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 29: [2022-11-26 00:47:06,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 31: [2022-11-26 00:47:06,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 5: [2022-11-26 00:47:06,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 33: [2022-11-26 00:47:06,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 1: [2022-11-26 00:47:06,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 7: [2022-11-26 00:47:06,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 3: [2022-11-26 00:47:06,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 35: [2022-11-26 00:47:06,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 22: [2022-11-26 00:47:06,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 12: [2022-11-26 00:47:06,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:47:06,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 24: [2022-11-26 00:47:06,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 28: [2022-11-26 00:47:06,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:47:06,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 34: [2022-11-26 00:47:06,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 40: [2022-11-26 00:47:06,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 44: [2022-11-26 00:47:06,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 6: [2022-11-26 00:47:06,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 32: [2022-11-26 00:47:06,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 42: [2022-11-26 00:47:06,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 46: [2022-11-26 00:47:06,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 36: [2022-11-26 00:47:06,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 48: [2022-11-26 00:47:06,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 41: [2022-11-26 00:47:06,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 37: [2022-11-26 00:47:06,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 47: [2022-11-26 00:47:06,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 49: [2022-11-26 00:47:06,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 43: [2022-11-26 00:47:06,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 45: [2022-11-26 00:47:06,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 39: [2022-11-26 00:47:06,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 50: [2022-11-26 00:47:06,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 38: [2022-11-26 00:47:06,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 51: [2022-11-26 00:47:06,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 55: [2022-11-26 00:47:06,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 53: [2022-11-26 00:47:06,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 57: [2022-11-26 00:47:06,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 61: [2022-11-26 00:47:06,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 59: [2022-11-26 00:47:06,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 63: [2022-11-26 00:47:06,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 19: [2022-11-26 00:47:06,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 21: [2022-11-26 00:47:06,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 15: [2022-11-26 00:47:06,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-26 00:47:06,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 52: [2022-11-26 00:47:06,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 54: [2022-11-26 00:47:06,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 56: [2022-11-26 00:47:06,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 58: [2022-11-26 00:47:06,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 62: [2022-11-26 00:47:06,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 16: [2022-11-26 00:47:06,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 60: [2022-11-26 00:47:06,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 2: [2022-11-26 00:47:06,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 4: [2022-11-26 00:47:06,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 14: [2022-11-26 00:47:06,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 18: [2022-11-26 00:47:06,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 10: [2022-11-26 00:47:06,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 8: [2022-11-26 00:47:06,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 0: [2022-11-26 00:47:06,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 26: [2022-11-26 00:47:06,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 11: [2022-11-26 00:47:06,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 17: [2022-11-26 00:47:06,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 9: [2022-11-26 00:47:06,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 13: [2022-11-26 00:47:06,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 23: [2022-11-26 00:47:06,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 25: [2022-11-26 00:47:06,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 27: [2022-11-26 00:47:06,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 29: [2022-11-26 00:47:06,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 31: [2022-11-26 00:47:06,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 5: [2022-11-26 00:47:06,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 33: [2022-11-26 00:47:06,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 1: [2022-11-26 00:47:06,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 7: [2022-11-26 00:47:06,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 3: [2022-11-26 00:47:06,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 35: [2022-11-26 00:47:06,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 22: [2022-11-26 00:47:06,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 12: [2022-11-26 00:47:06,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 30: [2022-11-26 00:47:06,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 24: [2022-11-26 00:47:06,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 28: [2022-11-26 00:47:06,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 20: [2022-11-26 00:47:06,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 34: [2022-11-26 00:47:06,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 40: [2022-11-26 00:47:06,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 44: [2022-11-26 00:47:06,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 6: [2022-11-26 00:47:06,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 32: [2022-11-26 00:47:06,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 42: [2022-11-26 00:47:06,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 46: [2022-11-26 00:47:06,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 36: [2022-11-26 00:47:06,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 48: [2022-11-26 00:47:06,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 41: [2022-11-26 00:47:06,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 37: [2022-11-26 00:47:06,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 47: [2022-11-26 00:47:06,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 49: [2022-11-26 00:47:06,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 43: [2022-11-26 00:47:06,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 45: [2022-11-26 00:47:06,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 39: [2022-11-26 00:47:06,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 38: [2022-11-26 00:47:06,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 51: [2022-11-26 00:47:06,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 55: [2022-11-26 00:47:06,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 53: [2022-11-26 00:47:06,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 57: [2022-11-26 00:47:06,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 61: [2022-11-26 00:47:06,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 59: [2022-11-26 00:47:06,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 63: [2022-11-26 00:47:06,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 19: [2022-11-26 00:47:06,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 21: [2022-11-26 00:47:06,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 52: [2022-11-26 00:47:06,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 54: [2022-11-26 00:47:06,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 56: [2022-11-26 00:47:06,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 58: [2022-11-26 00:47:06,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 62: [2022-11-26 00:47:06,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 16: [2022-11-26 00:47:06,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 60: [2022-11-26 00:47:06,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 2: [2022-11-26 00:47:06,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 4: [2022-11-26 00:47:06,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 14: [2022-11-26 00:47:06,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 18: [2022-11-26 00:47:06,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 10: [2022-11-26 00:47:06,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 8: [2022-11-26 00:47:06,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 0: [2022-11-26 00:47:06,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 26: [2022-11-26 00:47:06,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 11: [2022-11-26 00:47:06,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 17: [2022-11-26 00:47:06,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 9: [2022-11-26 00:47:06,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 13: [2022-11-26 00:47:06,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 23: [2022-11-26 00:47:06,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 25: [2022-11-26 00:47:06,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 27: [2022-11-26 00:47:06,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 29: [2022-11-26 00:47:06,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 31: [2022-11-26 00:47:06,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 5: [2022-11-26 00:47:06,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 33: [2022-11-26 00:47:06,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 1: [2022-11-26 00:47:06,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 7: [2022-11-26 00:47:06,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 3: [2022-11-26 00:47:06,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 35: [2022-11-26 00:47:06,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 22: [2022-11-26 00:47:06,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 12: [2022-11-26 00:47:06,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 30: [2022-11-26 00:47:06,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 24: [2022-11-26 00:47:06,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 28: [2022-11-26 00:47:06,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 20: [2022-11-26 00:47:06,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 34: [2022-11-26 00:47:06,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 40: [2022-11-26 00:47:06,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 44: [2022-11-26 00:47:06,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 6: [2022-11-26 00:47:06,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 32: [2022-11-26 00:47:06,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 42: [2022-11-26 00:47:06,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 46: [2022-11-26 00:47:06,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 36: [2022-11-26 00:47:06,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 48: [2022-11-26 00:47:06,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 41: [2022-11-26 00:47:06,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 37: [2022-11-26 00:47:06,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 47: [2022-11-26 00:47:06,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 49: [2022-11-26 00:47:06,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 43: [2022-11-26 00:47:06,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 45: [2022-11-26 00:47:06,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 39: [2022-11-26 00:47:06,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 50: [2022-11-26 00:47:06,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 38: [2022-11-26 00:47:06,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 59: [2022-11-26 00:47:06,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 52: [2022-11-26 00:47:06,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 58: [2022-11-26 00:47:06,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 62: [2022-11-26 00:47:06,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 14: [2022-11-26 00:47:06,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 18: [2022-11-26 00:47:06,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 10: [2022-11-26 00:47:06,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-26 00:47:06,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-26 00:47:06,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 8: [2022-11-26 00:47:06,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 17: [2022-11-26 00:47:06,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 27: [2022-11-26 00:47:06,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 5: [2022-11-26 00:47:06,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 33: [2022-11-26 00:47:06,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 1: [2022-11-26 00:47:06,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 7: [2022-11-26 00:47:06,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 3: [2022-11-26 00:47:06,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 22: [2022-11-26 00:47:06,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 12: [2022-11-26 00:47:06,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:47:06,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 40: [2022-11-26 00:47:06,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 50: [2022-11-26 00:47:06,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 59: [2022-11-26 00:47:06,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 62: [2022-11-26 00:47:06,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 14: [2022-11-26 00:47:06,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 18: [2022-11-26 00:47:06,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 8: [2022-11-26 00:47:06,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 17: [2022-11-26 00:47:06,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 27: [2022-11-26 00:47:06,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 5: [2022-11-26 00:47:06,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 33: [2022-11-26 00:47:06,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 1: [2022-11-26 00:47:06,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 7: [2022-11-26 00:47:06,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 3: [2022-11-26 00:47:06,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 12: [2022-11-26 00:47:06,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 40: [2022-11-26 00:47:06,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 59: [2022-11-26 00:47:06,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 52: [2022-11-26 00:47:06,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 58: [2022-11-26 00:47:06,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 62: [2022-11-26 00:47:06,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 14: [2022-11-26 00:47:06,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 18: [2022-11-26 00:47:06,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 8: [2022-11-26 00:47:06,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 17: [2022-11-26 00:47:06,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 27: [2022-11-26 00:47:06,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 5: [2022-11-26 00:47:06,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 33: [2022-11-26 00:47:06,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 1: [2022-11-26 00:47:06,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 7: [2022-11-26 00:47:06,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 3: [2022-11-26 00:47:06,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 22: [2022-11-26 00:47:06,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 12: [2022-11-26 00:47:06,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 40: [2022-11-26 00:47:06,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 52: [2022-11-26 00:47:06,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 58: [2022-11-26 00:47:06,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 22: [2022-11-26 00:47:06,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 63: [2022-11-26 00:47:06,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-26 00:47:06,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 49: [2022-11-26 00:47:06,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 63: [2022-11-26 00:47:06,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 49: [2022-11-26 00:47:06,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-26 00:47:06,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 20: [2022-11-26 00:47:06,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:47:06,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-26 00:47:06,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 16: [2022-11-26 00:47:06,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-26 00:47:06,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-26 00:47:06,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 61: [2022-11-26 00:47:06,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 56: [2022-11-26 00:47:06,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 61: [2022-11-26 00:47:06,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-26 00:47:06,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 56: [2022-11-26 00:47:06,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-26 00:47:06,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 0: [2022-11-26 00:47:06,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-26 00:47:06,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-26 00:47:06,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 51: [2022-11-26 00:47:06,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-26 00:47:06,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-26 00:47:06,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 13: [2022-11-26 00:47:06,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-26 00:47:06,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 48: [2022-11-26 00:47:06,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 13: [2022-11-26 00:47:06,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 48: [2022-11-26 00:47:06,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-26 00:47:06,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 12: [2022-11-26 00:47:06,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-26 00:47:06,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-26 00:47:06,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 2: [2022-11-26 00:47:06,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-26 00:47:06,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-26 00:47:06,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 35: [2022-11-26 00:47:06,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-26 00:47:06,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-26 00:47:06,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 26: [2022-11-26 00:47:06,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 26: [2022-11-26 00:47:06,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-26 00:47:06,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 37: [2022-11-26 00:47:06,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 25: [2022-11-26 00:47:06,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 37: [2022-11-26 00:47:06,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-26 00:47:06,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 25: [2022-11-26 00:47:06,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-26 00:47:06,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 28: [2022-11-26 00:47:06,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-26 00:47:06,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-26 00:47:06,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 46: [2022-11-26 00:47:06,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-26 00:47:06,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 46: [2022-11-26 00:47:06,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 9: [2022-11-26 00:47:06,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 9: [2022-11-26 00:47:06,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-26 00:47:06,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 44: [2022-11-26 00:47:06,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-26 00:47:06,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-26 00:47:06,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 41: [2022-11-26 00:47:06,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 41: [2022-11-26 00:47:06,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 41: [2022-11-26 00:47:06,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 43: [2022-11-26 00:47:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-26 00:47:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-26 00:47:06,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 53: [2022-11-26 00:47:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-26 00:47:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-26 00:47:06,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 42: [2022-11-26 00:47:06,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 42: [2022-11-26 00:47:06,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-26 00:47:06,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 47: [2022-11-26 00:47:06,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 54: [2022-11-26 00:47:06,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 60: [2022-11-26 00:47:06,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 47: [2022-11-26 00:47:06,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-26 00:47:06,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 60: [2022-11-26 00:47:06,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 60: [2022-11-26 00:47:06,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 54: [2022-11-26 00:47:06,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-26 00:47:06,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 4: [2022-11-26 00:47:06,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-26 00:47:06,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-26 00:47:06,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 40: [2022-11-26 00:47:06,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-26 00:47:06,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-26 00:47:06,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 38: [2022-11-26 00:47:06,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-26 00:47:06,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-26 00:47:06,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 45: [2022-11-26 00:47:06,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 45: [2022-11-26 00:47:06,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-26 00:47:06,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 23: [2022-11-26 00:47:06,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-26 00:47:06,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-26 00:47:06,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 15: [2022-11-26 00:47:06,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-26 00:47:06,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-26 00:47:06,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 32: [2022-11-26 00:47:06,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 11: [2022-11-26 00:47:06,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 32: [2022-11-26 00:47:06,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 11: [2022-11-26 00:47:06,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 32: [2022-11-26 00:47:06,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 11: [2022-11-26 00:47:06,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 62: [2022-11-26 00:47:06,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-26 00:47:06,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-26 00:47:06,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 55: [2022-11-26 00:47:06,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-26 00:47:06,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-26 00:47:06,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 29: [2022-11-26 00:47:06,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 39: [2022-11-26 00:47:06,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 29: [2022-11-26 00:47:06,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 39: [2022-11-26 00:47:06,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 29: [2022-11-26 00:47:06,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 39: [2022-11-26 00:47:06,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 17: [2022-11-26 00:47:06,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-26 00:47:06,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-26 00:47:06,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 6: [2022-11-26 00:47:06,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 6: [2022-11-26 00:47:06,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-26 00:47:06,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 8: [2022-11-26 00:47:06,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-26 00:47:06,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-26 00:47:06,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 34: [2022-11-26 00:47:06,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 31: [2022-11-26 00:47:06,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 19: [2022-11-26 00:47:06,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 31: [2022-11-26 00:47:06,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-26 00:47:06,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 34: [2022-11-26 00:47:06,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 19: [2022-11-26 00:47:06,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 34: [2022-11-26 00:47:06,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 19: [2022-11-26 00:47:06,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 50: [2022-11-26 00:47:06,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 50: [2022-11-26 00:47:06,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-26 00:47:06,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 52: [2022-11-26 00:47:06,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 24: [2022-11-26 00:47:06,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-26 00:47:06,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 52: [2022-11-26 00:47:06,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 24: [2022-11-26 00:47:06,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 52: [2022-11-26 00:47:06,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 27: [2022-11-26 00:47:06,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:47:06,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 27: [2022-11-26 00:47:06,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-26 00:47:06,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 30: [2022-11-26 00:47:06,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 18: [2022-11-26 00:47:06,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 30: [2022-11-26 00:47:06,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 18: [2022-11-26 00:47:06,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-26 00:47:06,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 36: [2022-11-26 00:47:06,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 36: [2022-11-26 00:47:06,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-26 00:47:06,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 57: [2022-11-26 00:47:06,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 57: [2022-11-26 00:47:06,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 57: [2022-11-26 00:47:06,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 10: [2022-11-26 00:47:06,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-26 00:47:06,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 10: [2022-11-26 00:47:06,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 5: [2022-11-26 00:47:06,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-26 00:47:06,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-26 00:47:06,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 59: [2022-11-26 00:47:06,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 43: [2022-11-26 00:47:06,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 59: [2022-11-26 00:47:06,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-26 00:47:06,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 43: [2022-11-26 00:47:06,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-26 00:47:06,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 14: [2022-11-26 00:47:06,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 13: [2022-11-26 00:47:06,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 3: [2022-11-26 00:47:06,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 14: [2022-11-26 00:47:06,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-26 00:47:06,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 13: [2022-11-26 00:47:06,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-26 00:47:06,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 3: [2022-11-26 00:47:06,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 3: [2022-11-26 00:47:06,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 58: [2022-11-26 00:47:06,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-26 00:47:06,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-26 00:47:06,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 16: [2022-11-26 00:47:06,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 7: [2022-11-26 00:47:06,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-26 00:47:06,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 16: [2022-11-26 00:47:06,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 7: [2022-11-26 00:47:06,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 16: [2022-11-26 00:47:06,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 33: [2022-11-26 00:47:06,809] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-26 00:47:06,809] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-26 00:47:06,809] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 14: [2022-11-26 00:47:06,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-26 00:47:06,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-26 00:47:06,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 31: [2022-11-26 00:47:06,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 31: [2022-11-26 00:47:06,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-26 00:47:06,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 24: [2022-11-26 00:47:06,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 1: [2022-11-26 00:47:06,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-26 00:47:06,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 24: [2022-11-26 00:47:06,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 1: [2022-11-26 00:47:06,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-26 00:47:06,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 24: [2022-11-26 00:47:06,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 1: [2022-11-26 00:47:06,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 1: [2022-11-26 00:47:06,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 59: [2022-11-26 00:47:06,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 40: [2022-11-26 00:47:06,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 59: [2022-11-26 00:47:06,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-26 00:47:06,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 40: [2022-11-26 00:47:06,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 46: [2022-11-26 00:47:06,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 40: [2022-11-26 00:47:06,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 46: [2022-11-26 00:47:06,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-26 00:47:06,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 51: [2022-11-26 00:47:06,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-26 00:47:06,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-26 00:47:06,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 57: [2022-11-26 00:47:06,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 57: [2022-11-26 00:47:06,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 57: [2022-11-26 00:47:06,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 22: [2022-11-26 00:47:06,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 15: [2022-11-26 00:47:06,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-26 00:47:06,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 22: [2022-11-26 00:47:06,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 15: [2022-11-26 00:47:06,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 22: [2022-11-26 00:47:06,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 54: [2022-11-26 00:47:06,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 21: [2022-11-26 00:47:06,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 54: [2022-11-26 00:47:06,813] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 48: [2022-11-26 00:47:06,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 21: [2022-11-26 00:47:06,813] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-26 00:47:06,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 48: [2022-11-26 00:47:06,813] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 41: [2022-11-26 00:47:06,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 48: [2022-11-26 00:47:06,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 41: [2022-11-26 00:47:06,813] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-26 00:47:06,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 56: [2022-11-26 00:47:06,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 56: [2022-11-26 00:47:06,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-26 00:47:06,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 54: [2022-11-26 00:47:06,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 35: [2022-11-26 00:47:06,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-26 00:47:06,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-26 00:47:06,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 3: [2022-11-26 00:47:06,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-26 00:47:06,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 3: [2022-11-26 00:47:06,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 44: [2022-11-26 00:47:06,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 7: [2022-11-26 00:47:06,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 44: [2022-11-26 00:47:06,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-26 00:47:06,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 7: [2022-11-26 00:47:06,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 7: [2022-11-26 00:47:06,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 42: [2022-11-26 00:47:06,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 42: [2022-11-26 00:47:06,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 29: [2022-11-26 00:47:06,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 42: [2022-11-26 00:47:06,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 29: [2022-11-26 00:47:06,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 29: [2022-11-26 00:47:06,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 45: [2022-11-26 00:47:06,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 45: [2022-11-26 00:47:06,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 45: [2022-11-26 00:47:06,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 63: [2022-11-26 00:47:06,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-26 00:47:06,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-26 00:47:06,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 53: [2022-11-26 00:47:06,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-26 00:47:06,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-26 00:47:06,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 20: [2022-11-26 00:47:06,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:47:06,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-26 00:47:06,817] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-26 00:47:06,817] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-26 00:47:06,817] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 20: [2022-11-26 00:47:06,817] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 49: [2022-11-26 00:47:06,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-26 00:47:06,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 4: [2022-11-26 00:47:06,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 49: [2022-11-26 00:47:06,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 18: [2022-11-26 00:47:06,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-26 00:47:06,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-26 00:47:06,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 4: [2022-11-26 00:47:06,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-26 00:47:06,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 29: [2022-11-26 00:47:06,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-26 00:47:06,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-26 00:47:06,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 22: [2022-11-26 00:47:06,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-26 00:47:06,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-26 00:47:06,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 13: [2022-11-26 00:47:06,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 13: [2022-11-26 00:47:06,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step11000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-26 00:47:06,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! 0: successfully saved checkpoint at iteration 11000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5202.32 63: iteration 11010/ 24424 | consumed samples: 5637120 | consumed tokens: 11544821760 | elapsed time per iteration (s): 2.84 | learning rate: 1.254E-04 | global batch size: 512 | lm loss: 2.136363E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.597 | TFLOPs: 18.59 | 63: iteration 11020/ 24424 | consumed samples: 5642240 | consumed tokens: 11555307520 | elapsed time per iteration (s): 2.25 | learning rate: 1.253E-04 | global batch size: 512 | lm loss: 2.122369E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.101 | TFLOPs: 23.38 | 63: iteration 11030/ 24424 | consumed samples: 5647360 | consumed tokens: 11565793280 | elapsed time per iteration (s): 2.24 | learning rate: 1.252E-04 | global batch size: 512 | lm loss: 2.127438E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.083 | TFLOPs: 23.48 | 63: iteration 11040/ 24424 | consumed samples: 5652480 | consumed tokens: 11576279040 | elapsed time per iteration (s): 2.26 | learning rate: 1.251E-04 | global batch size: 512 | lm loss: 2.126699E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.343 | TFLOPs: 23.30 | 63: iteration 11050/ 24424 | consumed samples: 5657600 | consumed tokens: 11586764800 | elapsed time per iteration (s): 2.25 | learning rate: 1.250E-04 | global batch size: 512 | lm loss: 2.142883E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.613 | TFLOPs: 23.43 | 63: iteration 11060/ 24424 | consumed samples: 5662720 | consumed tokens: 11597250560 | elapsed time per iteration (s): 2.25 | learning rate: 1.248E-04 | global batch size: 512 | lm loss: 2.140988E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.809 | TFLOPs: 23.45 | 63: iteration 11070/ 24424 | consumed samples: 5667840 | consumed tokens: 11607736320 | elapsed time per iteration (s): 2.35 | learning rate: 1.247E-04 | global batch size: 512 | lm loss: 2.121771E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 218.003 | TFLOPs: 22.44 | 63: iteration 11080/ 24424 | consumed samples: 5672960 | consumed tokens: 11618222080 | elapsed time per iteration (s): 2.25 | learning rate: 1.246E-04 | global batch size: 512 | lm loss: 2.136876E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.295 | TFLOPs: 23.40 | 63: iteration 11090/ 24424 | consumed samples: 5678080 | consumed tokens: 11628707840 | elapsed time per iteration (s): 2.88 | learning rate: 1.245E-04 | global batch size: 512 | lm loss: 2.119523E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.489 | TFLOPs: 18.27 | 63: iteration 11100/ 24424 | consumed samples: 5683200 | consumed tokens: 11639193600 | elapsed time per iteration (s): 2.42 | learning rate: 1.244E-04 | global batch size: 512 | lm loss: 2.136901E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 211.234 | TFLOPs: 21.75 | 63: iteration 11110/ 24424 | consumed samples: 5688320 | consumed tokens: 11649679360 | elapsed time per iteration (s): 2.28 | learning rate: 1.243E-04 | global batch size: 512 | lm loss: 2.131836E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.930 | TFLOPs: 23.16 | 63: iteration 11120/ 24424 | consumed samples: 5693440 | consumed tokens: 11660165120 | elapsed time per iteration (s): 2.25 | learning rate: 1.241E-04 | global batch size: 512 | lm loss: 2.143591E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.737 | TFLOPs: 23.44 | 63: iteration 11130/ 24424 | consumed samples: 5698560 | consumed tokens: 11670650880 | elapsed time per iteration (s): 2.27 | learning rate: 1.240E-04 | global batch size: 512 | lm loss: 2.111074E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.261 | TFLOPs: 23.19 | 63: iteration 11140/ 24424 | consumed samples: 5703680 | consumed tokens: 11681136640 | elapsed time per iteration (s): 2.26 | learning rate: 1.239E-04 | global batch size: 512 | lm loss: 2.146800E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.029 | TFLOPs: 23.37 | 63: iteration 11150/ 24424 | consumed samples: 5708800 | consumed tokens: 11691622400 | elapsed time per iteration (s): 2.26 | learning rate: 1.238E-04 | global batch size: 512 | lm loss: 2.138079E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.968 | TFLOPs: 23.37 | 63: iteration 11160/ 24424 | consumed samples: 5713920 | consumed tokens: 11702108160 | elapsed time per iteration (s): 2.24 | learning rate: 1.237E-04 | global batch size: 512 | lm loss: 2.114909E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.479 | TFLOPs: 23.52 | 63: iteration 11170/ 24424 | consumed samples: 5719040 | consumed tokens: 11712593920 | elapsed time per iteration (s): 2.25 | learning rate: 1.236E-04 | global batch size: 512 | lm loss: 2.121316E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.047 | TFLOPs: 23.48 | 63: iteration 11180/ 24424 | consumed samples: 5724160 | consumed tokens: 11723079680 | elapsed time per iteration (s): 2.26 | learning rate: 1.234E-04 | global batch size: 512 | lm loss: 2.100902E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.133 | TFLOPs: 23.28 | 63: iteration 11190/ 24424 | consumed samples: 5729280 | consumed tokens: 11733565440 | elapsed time per iteration (s): 2.23 | learning rate: 1.233E-04 | global batch size: 512 | lm loss: 2.113890E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.782 | TFLOPs: 23.65 | 63: iteration 11200/ 24424 | consumed samples: 5734400 | consumed tokens: 11744051200 | elapsed time per iteration (s): 2.24 | learning rate: 1.232E-04 | global batch size: 512 | lm loss: 2.094890E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.230 | TFLOPs: 23.50 | 63: iteration 11210/ 24424 | consumed samples: 5739520 | consumed tokens: 11754536960 | elapsed time per iteration (s): 2.24 | learning rate: 1.231E-04 | global batch size: 512 | lm loss: 2.104043E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.370 | TFLOPs: 23.51 | 63: iteration 11220/ 24424 | consumed samples: 5744640 | consumed tokens: 11765022720 | elapsed time per iteration (s): 4.56 | learning rate: 1.230E-04 | global batch size: 512 | lm loss: 2.105530E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 112.181 | TFLOPs: 11.55 | 63: iteration 11230/ 24424 | consumed samples: 5749760 | consumed tokens: 11775508480 | elapsed time per iteration (s): 2.26 | learning rate: 1.229E-04 | global batch size: 512 | lm loss: 2.134060E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.670 | TFLOPs: 23.33 | 63: iteration 11240/ 24424 | consumed samples: 5754880 | consumed tokens: 11785994240 | elapsed time per iteration (s): 2.32 | learning rate: 1.228E-04 | global batch size: 512 | lm loss: 2.123599E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.328 | TFLOPs: 22.68 | 63: iteration 11250/ 24424 | consumed samples: 5760000 | consumed tokens: 11796480000 | elapsed time per iteration (s): 2.25 | learning rate: 1.226E-04 | global batch size: 512 | lm loss: 2.111858E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.465 | TFLOPs: 23.42 | 63: iteration 11260/ 24424 | consumed samples: 5765120 | consumed tokens: 11806965760 | elapsed time per iteration (s): 2.24 | learning rate: 1.225E-04 | global batch size: 512 | lm loss: 2.132460E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.042 | TFLOPs: 23.58 | 63: iteration 11270/ 24424 | consumed samples: 5770240 | consumed tokens: 11817451520 | elapsed time per iteration (s): 2.23 | learning rate: 1.224E-04 | global batch size: 512 | lm loss: 2.126635E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.558 | TFLOPs: 23.63 | 63: iteration 11280/ 24424 | consumed samples: 5775360 | consumed tokens: 11827937280 | elapsed time per iteration (s): 2.24 | learning rate: 1.223E-04 | global batch size: 512 | lm loss: 2.132938E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.454 | TFLOPs: 23.52 | 63: iteration 11290/ 24424 | consumed samples: 5780480 | consumed tokens: 11838423040 | elapsed time per iteration (s): 2.24 | learning rate: 1.222E-04 | global batch size: 512 | lm loss: 2.104314E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.291 | TFLOPs: 23.50 | 63: iteration 11300/ 24424 | consumed samples: 5785600 | consumed tokens: 11848908800 | elapsed time per iteration (s): 2.26 | learning rate: 1.221E-04 | global batch size: 512 | lm loss: 2.119011E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.443 | TFLOPs: 23.31 | 63: iteration 11310/ 24424 | consumed samples: 5790720 | consumed tokens: 11859394560 | elapsed time per iteration (s): 2.23 | learning rate: 1.219E-04 | global batch size: 512 | lm loss: 2.104654E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.025 | TFLOPs: 23.68 | 63: iteration 11320/ 24424 | consumed samples: 5795840 | consumed tokens: 11869880320 | elapsed time per iteration (s): 2.26 | learning rate: 1.218E-04 | global batch size: 512 | lm loss: 2.123450E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.990 | TFLOPs: 23.37 | 63: iteration 11330/ 24424 | consumed samples: 5800960 | consumed tokens: 11880366080 | elapsed time per iteration (s): 2.24 | learning rate: 1.217E-04 | global batch size: 512 | lm loss: 2.113588E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.547 | TFLOPs: 23.53 | 63: iteration 11340/ 24424 | consumed samples: 5806080 | consumed tokens: 11890851840 | elapsed time per iteration (s): 2.25 | learning rate: 1.216E-04 | global batch size: 512 | lm loss: 2.117344E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.093 | TFLOPs: 23.38 | 63: iteration 11350/ 24424 | consumed samples: 5811200 | consumed tokens: 11901337600 | elapsed time per iteration (s): 2.25 | learning rate: 1.215E-04 | global batch size: 512 | lm loss: 2.127076E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.323 | TFLOPs: 23.40 | 63: iteration 11360/ 24424 | consumed samples: 5816320 | consumed tokens: 11911823360 | elapsed time per iteration (s): 2.24 | learning rate: 1.214E-04 | global batch size: 512 | lm loss: 2.118501E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.265 | TFLOPs: 23.50 | 63: iteration 11370/ 24424 | consumed samples: 5821440 | consumed tokens: 11922309120 | elapsed time per iteration (s): 2.23 | learning rate: 1.212E-04 | global batch size: 512 | lm loss: 2.097449E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.895 | TFLOPs: 23.67 | 63: iteration 11380/ 24424 | consumed samples: 5826560 | consumed tokens: 11932794880 | elapsed time per iteration (s): 2.23 | learning rate: 1.211E-04 | global batch size: 512 | lm loss: 2.117517E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.481 | TFLOPs: 23.62 | 63: iteration 11390/ 24424 | consumed samples: 5831680 | consumed tokens: 11943280640 | elapsed time per iteration (s): 2.30 | learning rate: 1.210E-04 | global batch size: 512 | lm loss: 2.125626E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.936 | TFLOPs: 22.95 | 63: iteration 11400/ 24424 | consumed samples: 5836800 | consumed tokens: 11953766400 | elapsed time per iteration (s): 4.07 | learning rate: 1.209E-04 | global batch size: 512 | lm loss: 2.122549E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 125.673 | TFLOPs: 12.94 | 63: iteration 11410/ 24424 | consumed samples: 5841920 | consumed tokens: 11964252160 | elapsed time per iteration (s): 2.24 | learning rate: 1.208E-04 | global batch size: 512 | lm loss: 2.118328E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.800 | TFLOPs: 23.55 | 63: iteration 11420/ 24424 | consumed samples: 5847040 | consumed tokens: 11974737920 | elapsed time per iteration (s): 2.26 | learning rate: 1.207E-04 | global batch size: 512 | lm loss: 2.109265E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.544 | TFLOPs: 23.32 | 63: iteration 11430/ 24424 | consumed samples: 5852160 | consumed tokens: 11985223680 | elapsed time per iteration (s): 2.24 | learning rate: 1.206E-04 | global batch size: 512 | lm loss: 2.109364E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.891 | TFLOPs: 23.56 | 63: iteration 11440/ 24424 | consumed samples: 5857280 | consumed tokens: 11995709440 | elapsed time per iteration (s): 2.25 | learning rate: 1.204E-04 | global batch size: 512 | lm loss: 2.130076E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.372 | TFLOPs: 23.41 | 63: iteration 11450/ 24424 | consumed samples: 5862400 | consumed tokens: 12006195200 | elapsed time per iteration (s): 2.24 | learning rate: 1.203E-04 | global batch size: 512 | lm loss: 2.121226E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.103 | TFLOPs: 23.48 | 63: iteration 11460/ 24424 | consumed samples: 5867520 | consumed tokens: 12016680960 | elapsed time per iteration (s): 2.25 | learning rate: 1.202E-04 | global batch size: 512 | lm loss: 2.114762E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.872 | TFLOPs: 23.46 | 63: iteration 11470/ 24424 | consumed samples: 5872640 | consumed tokens: 12027166720 | elapsed time per iteration (s): 2.25 | learning rate: 1.201E-04 | global batch size: 512 | lm loss: 2.095688E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.685 | TFLOPs: 23.44 | 63: iteration 11480/ 24424 | consumed samples: 5877760 | consumed tokens: 12037652480 | elapsed time per iteration (s): 2.23 | learning rate: 1.200E-04 | global batch size: 512 | lm loss: 2.113737E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.632 | TFLOPs: 23.64 | 63: iteration 11490/ 24424 | consumed samples: 5882880 | consumed tokens: 12048138240 | elapsed time per iteration (s): 2.24 | learning rate: 1.199E-04 | global batch size: 512 | lm loss: 2.091641E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.216 | TFLOPs: 23.49 | 63: iteration 11500/ 24424 | consumed samples: 5888000 | consumed tokens: 12058624000 | elapsed time per iteration (s): 2.23 | learning rate: 1.197E-04 | global batch size: 512 | lm loss: 2.104352E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.671 | TFLOPs: 23.64 | 63: iteration 11510/ 24424 | consumed samples: 5893120 | consumed tokens: 12069109760 | elapsed time per iteration (s): 2.25 | learning rate: 1.196E-04 | global batch size: 512 | lm loss: 2.109480E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.834 | TFLOPs: 23.45 | 63: iteration 11520/ 24424 | consumed samples: 5898240 | consumed tokens: 12079595520 | elapsed time per iteration (s): 2.24 | learning rate: 1.195E-04 | global batch size: 512 | lm loss: 2.103110E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.425 | TFLOPs: 23.52 | 63: iteration 11530/ 24424 | consumed samples: 5903360 | consumed tokens: 12090081280 | elapsed time per iteration (s): 2.31 | learning rate: 1.194E-04 | global batch size: 512 | lm loss: 2.106622E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.252 | TFLOPs: 22.78 | 63: iteration 11540/ 24424 | consumed samples: 5908480 | consumed tokens: 12100567040 | elapsed time per iteration (s): 2.24 | learning rate: 1.193E-04 | global batch size: 512 | lm loss: 2.115388E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.516 | TFLOPs: 23.52 | 63: iteration 11550/ 24424 | consumed samples: 5913600 | consumed tokens: 12111052800 | elapsed time per iteration (s): 2.23 | learning rate: 1.192E-04 | global batch size: 512 | lm loss: 2.090897E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.445 | TFLOPs: 23.62 | 63: iteration 11560/ 24424 | consumed samples: 5918720 | consumed tokens: 12121538560 | elapsed time per iteration (s): 2.27 | learning rate: 1.190E-04 | global batch size: 512 | lm loss: 2.089106E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.037 | TFLOPs: 23.27 | 63: iteration 11570/ 24424 | consumed samples: 5923840 | consumed tokens: 12132024320 | elapsed time per iteration (s): 2.24 | learning rate: 1.189E-04 | global batch size: 512 | lm loss: 2.122937E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.127 | TFLOPs: 23.48 | 63: iteration 11580/ 24424 | consumed samples: 5928960 | consumed tokens: 12142510080 | elapsed time per iteration (s): 2.23 | learning rate: 1.188E-04 | global batch size: 512 | lm loss: 2.078653E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.915 | TFLOPs: 23.67 | 63: iteration 11590/ 24424 | consumed samples: 5934080 | consumed tokens: 12152995840 | elapsed time per iteration (s): 2.23 | learning rate: 1.187E-04 | global batch size: 512 | lm loss: 2.118423E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.795 | TFLOPs: 23.66 | 63: iteration 11600/ 24424 | consumed samples: 5939200 | consumed tokens: 12163481600 | elapsed time per iteration (s): 2.24 | learning rate: 1.186E-04 | global batch size: 512 | lm loss: 2.121918E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.048 | TFLOPs: 23.58 | 63: iteration 11610/ 24424 | consumed samples: 5944320 | consumed tokens: 12173967360 | elapsed time per iteration (s): 2.23 | learning rate: 1.185E-04 | global batch size: 512 | lm loss: 2.099246E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.834 | TFLOPs: 23.66 | 63: iteration 11620/ 24424 | consumed samples: 5949440 | consumed tokens: 12184453120 | elapsed time per iteration (s): 2.26 | learning rate: 1.183E-04 | global batch size: 512 | lm loss: 2.099561E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.163 | TFLOPs: 23.28 | 63: iteration 11630/ 24424 | consumed samples: 5954560 | consumed tokens: 12194938880 | elapsed time per iteration (s): 2.28 | learning rate: 1.182E-04 | global batch size: 512 | lm loss: 2.109617E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.949 | TFLOPs: 23.16 | 63: iteration 11640/ 24424 | consumed samples: 5959680 | consumed tokens: 12205424640 | elapsed time per iteration (s): 2.31 | learning rate: 1.181E-04 | global batch size: 512 | lm loss: 2.114369E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.059 | TFLOPs: 22.86 | 63: iteration 11650/ 24424 | consumed samples: 5964800 | consumed tokens: 12215910400 | elapsed time per iteration (s): 2.26 | learning rate: 1.180E-04 | global batch size: 512 | lm loss: 2.113902E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.678 | TFLOPs: 23.34 | 63: iteration 11660/ 24424 | consumed samples: 5969920 | consumed tokens: 12226396160 | elapsed time per iteration (s): 2.23 | learning rate: 1.179E-04 | global batch size: 512 | lm loss: 2.112600E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.702 | TFLOPs: 23.65 | 63: iteration 11670/ 24424 | consumed samples: 5975040 | consumed tokens: 12236881920 | elapsed time per iteration (s): 2.23 | learning rate: 1.178E-04 | global batch size: 512 | lm loss: 2.103939E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.696 | TFLOPs: 23.65 | 63: iteration 11680/ 24424 | consumed samples: 5980160 | consumed tokens: 12247367680 | elapsed time per iteration (s): 2.33 | learning rate: 1.176E-04 | global batch size: 512 | lm loss: 2.113582E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.022 | TFLOPs: 22.65 | 63: iteration 11690/ 24424 | consumed samples: 5985280 | consumed tokens: 12257853440 | elapsed time per iteration (s): 2.27 | learning rate: 1.175E-04 | global batch size: 512 | lm loss: 2.108117E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.238 | TFLOPs: 23.19 | 63: iteration 11700/ 24424 | consumed samples: 5990400 | consumed tokens: 12268339200 | elapsed time per iteration (s): 2.28 | learning rate: 1.174E-04 | global batch size: 512 | lm loss: 2.091152E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.133 | TFLOPs: 23.07 | 63: iteration 11710/ 24424 | consumed samples: 5995520 | consumed tokens: 12278824960 | elapsed time per iteration (s): 2.48 | learning rate: 1.173E-04 | global batch size: 512 | lm loss: 2.106043E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 206.088 | TFLOPs: 21.22 | 63: iteration 11720/ 24424 | consumed samples: 6000640 | consumed tokens: 12289310720 | elapsed time per iteration (s): 2.50 | learning rate: 1.172E-04 | global batch size: 512 | lm loss: 2.100431E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 204.760 | TFLOPs: 21.08 | 63: iteration 11730/ 24424 | consumed samples: 6005760 | consumed tokens: 12299796480 | elapsed time per iteration (s): 2.27 | learning rate: 1.171E-04 | global batch size: 512 | lm loss: 2.091904E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.714 | TFLOPs: 23.24 | 63: iteration 11740/ 24424 | consumed samples: 6010880 | consumed tokens: 12310282240 | elapsed time per iteration (s): 5.31 | learning rate: 1.169E-04 | global batch size: 512 | lm loss: 2.121098E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 96.341 | TFLOPs: 9.92 | 63: iteration 11750/ 24424 | consumed samples: 6016000 | consumed tokens: 12320768000 | elapsed time per iteration (s): 2.24 | learning rate: 1.168E-04 | global batch size: 512 | lm loss: 2.111580E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.594 | TFLOPs: 23.53 | 63: iteration 11760/ 24424 | consumed samples: 6021120 | consumed tokens: 12331253760 | elapsed time per iteration (s): 2.26 | learning rate: 1.167E-04 | global batch size: 512 | lm loss: 2.102463E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.984 | TFLOPs: 23.37 | 63: iteration 11770/ 24424 | consumed samples: 6026240 | consumed tokens: 12341739520 | elapsed time per iteration (s): 2.23 | learning rate: 1.166E-04 | global batch size: 512 | lm loss: 2.146014E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.944 | TFLOPs: 23.67 | 63: iteration 11780/ 24424 | consumed samples: 6031360 | consumed tokens: 12352225280 | elapsed time per iteration (s): 2.23 | learning rate: 1.165E-04 | global batch size: 512 | lm loss: 2.133616E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.311 | TFLOPs: 23.61 | 63: iteration 11790/ 24424 | consumed samples: 6036480 | consumed tokens: 12362711040 | elapsed time per iteration (s): 2.27 | learning rate: 1.164E-04 | global batch size: 512 | lm loss: 2.095840E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.481 | TFLOPs: 23.21 | 63: iteration 11800/ 24424 | consumed samples: 6041600 | consumed tokens: 12373196800 | elapsed time per iteration (s): 2.23 | learning rate: 1.162E-04 | global batch size: 512 | lm loss: 2.092270E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.932 | TFLOPs: 23.67 | 63: iteration 11810/ 24424 | consumed samples: 6046720 | consumed tokens: 12383682560 | elapsed time per iteration (s): 2.25 | learning rate: 1.161E-04 | global batch size: 512 | lm loss: 2.122198E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.179 | TFLOPs: 23.39 | 63: iteration 11820/ 24424 | consumed samples: 6051840 | consumed tokens: 12394168320 | elapsed time per iteration (s): 2.26 | learning rate: 1.160E-04 | global batch size: 512 | lm loss: 2.092270E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.551 | TFLOPs: 23.32 | 63: iteration 11830/ 24424 | consumed samples: 6056960 | consumed tokens: 12404654080 | elapsed time per iteration (s): 2.23 | learning rate: 1.159E-04 | global batch size: 512 | lm loss: 2.085239E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.664 | TFLOPs: 23.64 | 63: iteration 11840/ 24424 | consumed samples: 6062080 | consumed tokens: 12415139840 | elapsed time per iteration (s): 2.24 | learning rate: 1.158E-04 | global batch size: 512 | lm loss: 2.096828E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.320 | TFLOPs: 23.50 | 63: iteration 11850/ 24424 | consumed samples: 6067200 | consumed tokens: 12425625600 | elapsed time per iteration (s): 2.25 | learning rate: 1.157E-04 | global batch size: 512 | lm loss: 2.110431E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.883 | TFLOPs: 23.46 | 63: iteration 11860/ 24424 | consumed samples: 6072320 | consumed tokens: 12436111360 | elapsed time per iteration (s): 2.36 | learning rate: 1.155E-04 | global batch size: 512 | lm loss: 2.098637E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 216.594 | TFLOPs: 22.30 | 63: iteration 11870/ 24424 | consumed samples: 6077440 | consumed tokens: 12446597120 | elapsed time per iteration (s): 2.25 | learning rate: 1.154E-04 | global batch size: 512 | lm loss: 2.093042E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.016 | TFLOPs: 23.47 | 63: iteration 11880/ 24424 | consumed samples: 6082560 | consumed tokens: 12457082880 | elapsed time per iteration (s): 2.26 | learning rate: 1.153E-04 | global batch size: 512 | lm loss: 2.086629E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.952 | TFLOPs: 23.36 | 63: iteration 11890/ 24424 | consumed samples: 6087680 | consumed tokens: 12467568640 | elapsed time per iteration (s): 2.23 | learning rate: 1.152E-04 | global batch size: 512 | lm loss: 2.121098E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.789 | TFLOPs: 23.66 | 63: iteration 11900/ 24424 | consumed samples: 6092800 | consumed tokens: 12478054400 | elapsed time per iteration (s): 2.28 | learning rate: 1.151E-04 | global batch size: 512 | lm loss: 2.114804E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.929 | TFLOPs: 23.16 | 63: iteration 11910/ 24424 | consumed samples: 6097920 | consumed tokens: 12488540160 | elapsed time per iteration (s): 2.27 | learning rate: 1.150E-04 | global batch size: 512 | lm loss: 2.088540E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.351 | TFLOPs: 23.20 | 63: iteration 11920/ 24424 | consumed samples: 6103040 | consumed tokens: 12499025920 | elapsed time per iteration (s): 2.23 | learning rate: 1.148E-04 | global batch size: 512 | lm loss: 2.085775E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.915 | TFLOPs: 23.67 | 63: iteration 11930/ 24424 | consumed samples: 6108160 | consumed tokens: 12509511680 | elapsed time per iteration (s): 2.26 | learning rate: 1.147E-04 | global batch size: 512 | lm loss: 2.090944E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.147 | TFLOPs: 23.28 | 63: iteration 11940/ 24424 | consumed samples: 6113280 | consumed tokens: 12519997440 | elapsed time per iteration (s): 2.29 | learning rate: 1.146E-04 | global batch size: 512 | lm loss: 2.112251E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.479 | TFLOPs: 23.01 | 63: iteration 11950/ 24424 | consumed samples: 6118400 | consumed tokens: 12530483200 | elapsed time per iteration (s): 2.23 | learning rate: 1.145E-04 | global batch size: 512 | lm loss: 2.121936E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.300 | TFLOPs: 23.61 | 63: iteration 11960/ 24424 | consumed samples: 6123520 | consumed tokens: 12540968960 | elapsed time per iteration (s): 2.28 | learning rate: 1.144E-04 | global batch size: 512 | lm loss: 2.087391E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.316 | TFLOPs: 23.09 | 63: iteration 11970/ 24424 | consumed samples: 6128640 | consumed tokens: 12551454720 | elapsed time per iteration (s): 2.26 | learning rate: 1.143E-04 | global batch size: 512 | lm loss: 2.105808E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.128 | TFLOPs: 23.28 | 63: iteration 11980/ 24424 | consumed samples: 6133760 | consumed tokens: 12561940480 | elapsed time per iteration (s): 2.25 | learning rate: 1.141E-04 | global batch size: 512 | lm loss: 2.085904E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.143 | TFLOPs: 23.38 | 63: iteration 11990/ 24424 | consumed samples: 6138880 | consumed tokens: 12572426240 | elapsed time per iteration (s): 2.34 | learning rate: 1.140E-04 | global batch size: 512 | lm loss: 2.108520E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 218.737 | TFLOPs: 22.52 | 0: [2022-11-26 01:26:06,747] [INFO] [logging.py:68:log_dist] [Rank 0] step=12000, skipped=0, lr=[0.00011391020919668837, 0.00011391020919668837, 0.00011391020919668837], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] 63: iteration 12000/ 24424 | consumed samples: 6144000 | consumed tokens: 12582912000 | elapsed time per iteration (s): 2.27 | learning rate: 1.139E-04 | global batch size: 512 | lm loss: 2.100971E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.890 | TFLOPs: 23.25 | 0: steps: 12000 loss: 2.1461 iter time (s): 2.325 samples/sec: 220.172 63: ------------------------------------------------------------------------------------------- 63: valid loss at iteration 12000 | lm loss value: 2.057033E+00 | lm loss PPL: 7.822726E+00 | 63: ------------------------------------------------------------------------------------------- 0: saving checkpoint at iteration 12000 to checkpoints_3b9 0: [2022-11-26 01:26:07,482] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step12000 is begin to save! 0: [2022-11-26 01:26:07,508] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_01-model_00-model_states.pt... 32: [2022-11-26 01:26:07,508] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_21-model_00-model_states.pt... 32: [2022-11-26 01:26:07,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_21-model_00-model_states.pt. 32: [2022-11-26 01:26:07,751] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_22-model_00-model_states.pt... 0: [2022-11-26 01:26:07,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_01-model_00-model_states.pt. 0: [2022-11-26 01:26:07,963] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_03-model_00-model_states.pt... 32: [2022-11-26 01:26:07,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_22-model_00-model_states.pt. 32: [2022-11-26 01:26:07,989] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_23-model_00-model_states.pt... 0: [2022-11-26 01:26:08,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_03-model_00-model_states.pt. 0: [2022-11-26 01:26:08,202] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_04-model_00-model_states.pt... 32: [2022-11-26 01:26:08,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_23-model_00-model_states.pt. 32: [2022-11-26 01:26:08,221] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_24-model_00-model_states.pt... 0: [2022-11-26 01:26:08,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_04-model_00-model_states.pt. 0: [2022-11-26 01:26:08,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_05-model_00-model_states.pt... 32: [2022-11-26 01:26:08,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_24-model_00-model_states.pt. 32: [2022-11-26 01:26:08,459] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_25-model_00-model_states.pt... 0: [2022-11-26 01:26:08,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_05-model_00-model_states.pt. 0: [2022-11-26 01:26:08,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_06-model_00-model_states.pt... 32: [2022-11-26 01:26:08,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_25-model_00-model_states.pt. 32: [2022-11-26 01:26:08,695] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_26-model_00-model_states.pt... 0: [2022-11-26 01:26:08,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_06-model_00-model_states.pt. 0: [2022-11-26 01:26:08,922] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_07-model_00-model_states.pt... 32: [2022-11-26 01:26:08,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_26-model_00-model_states.pt. 32: [2022-11-26 01:26:08,923] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_27-model_00-model_states.pt... 0: [2022-11-26 01:26:09,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_07-model_00-model_states.pt. 0: [2022-11-26 01:26:09,151] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_08-model_00-model_states.pt... 32: [2022-11-26 01:26:09,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_27-model_00-model_states.pt. 32: [2022-11-26 01:26:09,153] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_28-model_00-model_states.pt... 0: [2022-11-26 01:26:09,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_08-model_00-model_states.pt. 0: [2022-11-26 01:26:09,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_09-model_00-model_states.pt... 32: [2022-11-26 01:26:09,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_28-model_00-model_states.pt. 32: [2022-11-26 01:26:09,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_29-model_00-model_states.pt... 0: [2022-11-26 01:26:09,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_09-model_00-model_states.pt. 0: [2022-11-26 01:26:09,601] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_10-model_00-model_states.pt... 32: [2022-11-26 01:26:09,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_29-model_00-model_states.pt. 32: [2022-11-26 01:26:09,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_30-model_00-model_states.pt... 0: [2022-11-26 01:26:09,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_10-model_00-model_states.pt. 0: [2022-11-26 01:26:09,834] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_11-model_00-model_states.pt... 32: [2022-11-26 01:26:09,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_30-model_00-model_states.pt. 32: [2022-11-26 01:26:09,842] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_31-model_00-model_states.pt... 0: [2022-11-26 01:26:10,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_11-model_00-model_states.pt. 0: [2022-11-26 01:26:10,062] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_12-model_00-model_states.pt... 32: [2022-11-26 01:26:10,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_31-model_00-model_states.pt. 32: [2022-11-26 01:26:10,069] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_32-model_00-model_states.pt... 0: [2022-11-26 01:26:10,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_12-model_00-model_states.pt. 0: [2022-11-26 01:26:10,281] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_13-model_00-model_states.pt... 32: [2022-11-26 01:26:10,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_32-model_00-model_states.pt. 32: [2022-11-26 01:26:10,300] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_33-model_00-model_states.pt... 0: [2022-11-26 01:26:10,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_13-model_00-model_states.pt. 0: [2022-11-26 01:26:10,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_14-model_00-model_states.pt... 32: [2022-11-26 01:26:10,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_33-model_00-model_states.pt. 32: [2022-11-26 01:26:10,528] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_34-model_00-model_states.pt... 0: [2022-11-26 01:26:10,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_14-model_00-model_states.pt. 0: [2022-11-26 01:26:10,720] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_15-model_00-model_states.pt... 32: [2022-11-26 01:26:10,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_34-model_00-model_states.pt. 32: [2022-11-26 01:26:10,754] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_35-model_00-model_states.pt... 0: [2022-11-26 01:26:10,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_15-model_00-model_states.pt. 0: [2022-11-26 01:26:10,936] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_16-model_00-model_states.pt... 32: [2022-11-26 01:26:10,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_35-model_00-model_states.pt. 32: [2022-11-26 01:26:10,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_36-model_00-model_states.pt... 0: [2022-11-26 01:26:11,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_16-model_00-model_states.pt. 0: [2022-11-26 01:26:11,152] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_17-model_00-model_states.pt... 32: [2022-11-26 01:26:11,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_36-model_00-model_states.pt. 32: [2022-11-26 01:26:11,208] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_37-model_00-model_states.pt... 0: [2022-11-26 01:26:11,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_17-model_00-model_states.pt. 0: [2022-11-26 01:26:11,368] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_18-model_00-model_states.pt... 32: [2022-11-26 01:26:11,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_37-model_00-model_states.pt. 32: [2022-11-26 01:26:11,436] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_38-model_00-model_states.pt... 0: [2022-11-26 01:26:11,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_18-model_00-model_states.pt. 0: [2022-11-26 01:26:11,584] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_19-model_00-model_states.pt... 32: [2022-11-26 01:26:11,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_38-model_00-model_states.pt. 32: [2022-11-26 01:26:11,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_40-model_00-model_states.pt... 32: [2022-11-26 01:26:11,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_40-model_00-model_states.pt. 32: [2022-11-26 01:26:11,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/mp_rank_01_model_states.pt... 32: [2022-11-26 01:26:11,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/mp_rank_01_model_states.pt. 0: [2022-11-26 01:26:11,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_19-model_00-model_states.pt. 0: [2022-11-26 01:26:11,800] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/layer_20-model_00-model_states.pt... 0: [2022-11-26 01:26:12,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/layer_20-model_00-model_states.pt. 0: [2022-11-26 01:26:12,015] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step12000/mp_rank_00_model_states.pt 0: [2022-11-26 01:26:12,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/mp_rank_00_model_states.pt... 0: [2022-11-26 01:26:12,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/mp_rank_00_model_states.pt. 60: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 60: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 60: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 40: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 40: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 32: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 48: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 48: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 50: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 50: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 50: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 38: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 51: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 55: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 53: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 53: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 57: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 57: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 61: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 61: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 59: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 59: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 63: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 63: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 52: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 52: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 52: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 54: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 54: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 54: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 56: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 56: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 56: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 58: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 58: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 58: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 62: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 62: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 62: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 60: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 33: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 33: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 35: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 35: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 35: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 35: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 34: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 34: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 40: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 44: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 44: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 44: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 32: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 32: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 42: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 42: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 46: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 46: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 46: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 36: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 36: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 41: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 41: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 41: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 37: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 37: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 37: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 37: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 47: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 47: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 49: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 49: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 43: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 43: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 45: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 45: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 39: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 39: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 39: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 50: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 38: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 51: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 51: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 55: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 55: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 53: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 53: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 57: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 61: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 61: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 59: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 59: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 63: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 63: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 19: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 21: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 21: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 15: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 15: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 15: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 52: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 54: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 56: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 58: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 62: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 16: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 60: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 2: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 4: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 4: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 4: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 4: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 4: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 14: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 14: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 18: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 10: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 8: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 0: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 26: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 11: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 11: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 17: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 17: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 17: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 9: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 9: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 13: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 13: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 13: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 13: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 23: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 25: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 25: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 25: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 25: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 27: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 27: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 27: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 27: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 29: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 31: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 31: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 31: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 31: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 5: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 5: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 33: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 33: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 1: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 1: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 7: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 7: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 3: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 35: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 22: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 22: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 12: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 12: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 24: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 28: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 28: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 20: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 20: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 34: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 40: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 44: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 6: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 6: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 32: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 42: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 42: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 46: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 36: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 48: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 48: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 41: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 37: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 47: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 49: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 43: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 45: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 39: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 50: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 38: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 55: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 53: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 57: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 63: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 19: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 19: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 21: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 15: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 54: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 56: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 58: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 62: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 16: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 2: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 4: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 14: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 18: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 10: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 10: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 10: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 10: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 8: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 8: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 8: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 0: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 26: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 26: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 11: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 11: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 17: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 9: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 9: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 9: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 13: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 23: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 23: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 25: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 27: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 27: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 29: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 29: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 31: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 5: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 33: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 1: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 1: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 7: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 7: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 3: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 3: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 3: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 3: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 22: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 12: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 12: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 30: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 30: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 30: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 24: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 24: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 28: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 28: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 28: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 28: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 20: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 20: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 20: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 40: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 44: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 6: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 6: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 32: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 42: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 46: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 36: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 41: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 37: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 47: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 49: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 43: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 45: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 63: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 19: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 19: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 21: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 15: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 16: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 16: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 2: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 2: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 2: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 2: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 4: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 14: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 18: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 18: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 18: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 10: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 8: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 0: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 26: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 11: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 17: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 17: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 9: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 13: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 13: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 23: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 25: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 27: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 29: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 31: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 5: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 5: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 5: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 1: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 1: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 1: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 7: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 3: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 22: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 12: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 12: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 30: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 30: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 24: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 24: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 28: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 20: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 20: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 40: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 44: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 6: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 32: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 32: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 46: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 36: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 19: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 19: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 21: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 15: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 16: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 2: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 4: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 14: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 18: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 10: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 8: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 0: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 26: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 11: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 17: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 9: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 13: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 23: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 27: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 29: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 31: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 5: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 1: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 7: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 3: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 3: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 22: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 12: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 12: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 30: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 24: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 44: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 6: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 6: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 32: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 19: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 21: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 21: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 15: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 16: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 16: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 2: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 14: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 14: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 18: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 10: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 8: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 0: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 11: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 17: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 9: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 23: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 29: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 29: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 31: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 7: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 22: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 30: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 24: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 6: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 21: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 16: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 14: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 8: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 0: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 26: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 23: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 29: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 22: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 30: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 24: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 26: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 23: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 22: [2022-11-26 01:26:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step12000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 21: [2022-11-26 01:26:12,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-26 01:26:12,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 21: [2022-11-26 01:26:12,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 41: [2022-11-26 01:26:12,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 41: [2022-11-26 01:26:12,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-26 01:26:12,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 57: [2022-11-26 01:26:12,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 42: [2022-11-26 01:26:12,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 57: [2022-11-26 01:26:12,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 57: [2022-11-26 01:26:12,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 42: [2022-11-26 01:26:12,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-26 01:26:12,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 27: [2022-11-26 01:26:12,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 27: [2022-11-26 01:26:12,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-26 01:26:12,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 25: [2022-11-26 01:26:12,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 34: [2022-11-26 01:26:12,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 25: [2022-11-26 01:26:12,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-26 01:26:12,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 34: [2022-11-26 01:26:12,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-26 01:26:12,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 7: [2022-11-26 01:26:12,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 7: [2022-11-26 01:26:12,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 37: [2022-11-26 01:26:12,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 7: [2022-11-26 01:26:12,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 37: [2022-11-26 01:26:12,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-26 01:26:12,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 11: [2022-11-26 01:26:12,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 3: [2022-11-26 01:26:12,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 11: [2022-11-26 01:26:12,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 31: [2022-11-26 01:26:12,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 11: [2022-11-26 01:26:12,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 31: [2022-11-26 01:26:12,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 33: [2022-11-26 01:26:12,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 3: [2022-11-26 01:26:12,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 45: [2022-11-26 01:26:12,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 31: [2022-11-26 01:26:12,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 33: [2022-11-26 01:26:12,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 45: [2022-11-26 01:26:12,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 33: [2022-11-26 01:26:12,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 3: [2022-11-26 01:26:12,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 45: [2022-11-26 01:26:12,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 0: [2022-11-26 01:26:12,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-26 01:26:12,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-26 01:26:12,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 38: [2022-11-26 01:26:12,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-26 01:26:12,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-26 01:26:12,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 11: [2022-11-26 01:26:12,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-26 01:26:12,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-26 01:26:12,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 25: [2022-11-26 01:26:12,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 25: [2022-11-26 01:26:12,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 25: [2022-11-26 01:26:12,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 15: [2022-11-26 01:26:12,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 17: [2022-11-26 01:26:12,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-26 01:26:12,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 15: [2022-11-26 01:26:12,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 17: [2022-11-26 01:26:12,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 15: [2022-11-26 01:26:12,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 45: [2022-11-26 01:26:12,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-26 01:26:12,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-26 01:26:12,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 22: [2022-11-26 01:26:12,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 37: [2022-11-26 01:26:12,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 22: [2022-11-26 01:26:12,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 4: [2022-11-26 01:26:12,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 22: [2022-11-26 01:26:12,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 37: [2022-11-26 01:26:12,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-26 01:26:12,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 62: [2022-11-26 01:26:12,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-26 01:26:12,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-26 01:26:12,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 21: [2022-11-26 01:26:12,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 46: [2022-11-26 01:26:12,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 21: [2022-11-26 01:26:12,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 46: [2022-11-26 01:26:12,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 21: [2022-11-26 01:26:12,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 46: [2022-11-26 01:26:12,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 61: [2022-11-26 01:26:12,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-26 01:26:12,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-26 01:26:12,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 27: [2022-11-26 01:26:12,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 27: [2022-11-26 01:26:12,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-26 01:26:12,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 33: [2022-11-26 01:26:12,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 7: [2022-11-26 01:26:12,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 33: [2022-11-26 01:26:12,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 7: [2022-11-26 01:26:12,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 33: [2022-11-26 01:26:12,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 7: [2022-11-26 01:26:12,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 23: [2022-11-26 01:26:12,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-26 01:26:12,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 2: [2022-11-26 01:26:12,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 23: [2022-11-26 01:26:12,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 2: [2022-11-26 01:26:12,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-26 01:26:12,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 9: [2022-11-26 01:26:12,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 3: [2022-11-26 01:26:12,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 3: [2022-11-26 01:26:12,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-26 01:26:12,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 40: [2022-11-26 01:26:12,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-26 01:26:12,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-26 01:26:12,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 6: [2022-11-26 01:26:12,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 4: [2022-11-26 01:26:12,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-26 01:26:12,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 18: [2022-11-26 01:26:12,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 6: [2022-11-26 01:26:12,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 18: [2022-11-26 01:26:12,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 6: [2022-11-26 01:26:12,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 18: [2022-11-26 01:26:12,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 59: [2022-11-26 01:26:12,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 15: [2022-11-26 01:26:12,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 59: [2022-11-26 01:26:12,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-26 01:26:12,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 15: [2022-11-26 01:26:12,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-26 01:26:12,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 59: [2022-11-26 01:26:12,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-26 01:26:12,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 23: [2022-11-26 01:26:12,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 59: [2022-11-26 01:26:12,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 23: [2022-11-26 01:26:12,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-26 01:26:12,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 57: [2022-11-26 01:26:12,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 50: [2022-11-26 01:26:12,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 17: [2022-11-26 01:26:12,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 50: [2022-11-26 01:26:12,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 57: [2022-11-26 01:26:12,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 17: [2022-11-26 01:26:12,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 50: [2022-11-26 01:26:12,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 17: [2022-11-26 01:26:12,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 47: [2022-11-26 01:26:12,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 57: [2022-11-26 01:26:12,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 47: [2022-11-26 01:26:12,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-26 01:26:12,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 50: [2022-11-26 01:26:12,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 50: [2022-11-26 01:26:12,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-26 01:26:12,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 9: [2022-11-26 01:26:12,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-26 01:26:12,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 36: [2022-11-26 01:26:12,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 63: [2022-11-26 01:26:12,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 34: [2022-11-26 01:26:12,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 63: [2022-11-26 01:26:12,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-26 01:26:12,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 34: [2022-11-26 01:26:12,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-26 01:26:12,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 2: [2022-11-26 01:26:12,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-26 01:26:12,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-26 01:26:12,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 31: [2022-11-26 01:26:12,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-26 01:26:12,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-26 01:26:12,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 63: [2022-11-26 01:26:12,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 1: [2022-11-26 01:26:12,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 18: [2022-11-26 01:26:12,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 1: [2022-11-26 01:26:12,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-26 01:26:12,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 18: [2022-11-26 01:26:12,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-26 01:26:12,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 6: [2022-11-26 01:26:12,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-26 01:26:12,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-26 01:26:12,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 62: [2022-11-26 01:26:12,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-26 01:26:12,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-26 01:26:12,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-26 01:26:12,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 62: [2022-11-26 01:26:12,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-26 01:26:12,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 44: [2022-11-26 01:26:12,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-26 01:26:12,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-26 01:26:12,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-26 01:26:12,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-26 01:26:12,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 44: [2022-11-26 01:26:12,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 11: [2022-11-26 01:26:12,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 11: [2022-11-26 01:26:12,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-26 01:26:12,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 4: [2022-11-26 01:26:12,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 9: [2022-11-26 01:26:12,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-26 01:26:12,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-26 01:26:12,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 56: [2022-11-26 01:26:12,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-26 01:26:12,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 36: [2022-11-26 01:26:12,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 56: [2022-11-26 01:26:12,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 36: [2022-11-26 01:26:12,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 36: [2022-11-26 01:26:12,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-26 01:26:12,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 45: [2022-11-26 01:26:12,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 53: [2022-11-26 01:26:12,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 36: [2022-11-26 01:26:12,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 45: [2022-11-26 01:26:12,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 53: [2022-11-26 01:26:12,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 45: [2022-11-26 01:26:12,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 53: [2022-11-26 01:26:12,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 0: [2022-11-26 01:26:12,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 40: [2022-11-26 01:26:12,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-26 01:26:12,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-26 01:26:12,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 63: [2022-11-26 01:26:12,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-26 01:26:12,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 0: [2022-11-26 01:26:12,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 22: [2022-11-26 01:26:12,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-26 01:26:12,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-26 01:26:12,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 0: [2022-11-26 01:26:12,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 24: [2022-11-26 01:26:12,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 0: [2022-11-26 01:26:12,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 24: [2022-11-26 01:26:12,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-26 01:26:12,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 14: [2022-11-26 01:26:12,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 24: [2022-11-26 01:26:12,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 14: [2022-11-26 01:26:12,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 24: [2022-11-26 01:26:12,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 14: [2022-11-26 01:26:12,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 24: [2022-11-26 01:26:12,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 37: [2022-11-26 01:26:12,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-26 01:26:12,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-26 01:26:12,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 41: [2022-11-26 01:26:12,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-26 01:26:12,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 58: [2022-11-26 01:26:12,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 41: [2022-11-26 01:26:12,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 58: [2022-11-26 01:26:12,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-26 01:26:12,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 58: [2022-11-26 01:26:12,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-26 01:26:12,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-26 01:26:12,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 58: [2022-11-26 01:26:12,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-26 01:26:12,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-26 01:26:12,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 38: [2022-11-26 01:26:12,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 15: [2022-11-26 01:26:12,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-26 01:26:12,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-26 01:26:12,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 46: [2022-11-26 01:26:12,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-26 01:26:12,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-26 01:26:12,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 33: [2022-11-26 01:26:12,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 33: [2022-11-26 01:26:12,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 21: [2022-11-26 01:26:12,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 33: [2022-11-26 01:26:12,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 21: [2022-11-26 01:26:12,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-26 01:26:12,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 42: [2022-11-26 01:26:12,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 42: [2022-11-26 01:26:12,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-26 01:26:12,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 4: [2022-11-26 01:26:12,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-26 01:26:12,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 4: [2022-11-26 01:26:12,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 4: [2022-11-26 01:26:12,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-26 01:26:12,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 14: [2022-11-26 01:26:12,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 14: [2022-11-26 01:26:12,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-26 01:26:12,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 9: [2022-11-26 01:26:12,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 3: [2022-11-26 01:26:12,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-26 01:26:12,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-26 01:26:12,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 51: [2022-11-26 01:26:12,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-26 01:26:12,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-26 01:26:12,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 2: [2022-11-26 01:26:12,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-26 01:26:12,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 44: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 40: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 44: [2022-11-26 01:26:12,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 34: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 40: [2022-11-26 01:26:12,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 44: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 34: [2022-11-26 01:26:12,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 40: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 34: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 61: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 61: [2022-11-26 01:26:12,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 18: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-26 01:26:12,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 27: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 18: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 27: [2022-11-26 01:26:12,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 7: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 41: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 27: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 7: [2022-11-26 01:26:12,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 41: [2022-11-26 01:26:12,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 7: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 41: [2022-11-26 01:26:12,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 11: [2022-11-26 01:26:12,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-26 01:26:12,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-26 01:26:12,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 48: [2022-11-26 01:26:12,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-26 01:26:12,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-26 01:26:12,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 53: [2022-11-26 01:26:12,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-26 01:26:12,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-26 01:26:12,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 6: [2022-11-26 01:26:12,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 6: [2022-11-26 01:26:12,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 6: [2022-11-26 01:26:12,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 62: [2022-11-26 01:26:12,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-26 01:26:12,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 62: [2022-11-26 01:26:12,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 28: [2022-11-26 01:26:12,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-26 01:26:12,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 28: [2022-11-26 01:26:12,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-26 01:26:12,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-26 01:26:12,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-26 01:26:12,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-26 01:26:12,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 28: [2022-11-26 01:26:12,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 28: [2022-11-26 01:26:12,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 45: [2022-11-26 01:26:12,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-26 01:26:12,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-26 01:26:12,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 57: [2022-11-26 01:26:12,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 14: [2022-11-26 01:26:12,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-26 01:26:12,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 9: [2022-11-26 01:26:12,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 57: [2022-11-26 01:26:12,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 14: [2022-11-26 01:26:12,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 9: [2022-11-26 01:26:12,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 57: [2022-11-26 01:26:12,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 23: [2022-11-26 01:26:12,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-26 01:26:12,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-26 01:26:12,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 16: [2022-11-26 01:26:12,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-26 01:26:12,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 1: [2022-11-26 01:26:12,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 16: [2022-11-26 01:26:12,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 16: [2022-11-26 01:26:12,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-26 01:26:12,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 16: [2022-11-26 01:26:12,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 1: [2022-11-26 01:26:12,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-26 01:26:12,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 52: [2022-11-26 01:26:12,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-26 01:26:12,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-26 01:26:12,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 0: [2022-11-26 01:26:12,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 0: [2022-11-26 01:26:12,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-26 01:26:12,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 17: [2022-11-26 01:26:12,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-26 01:26:12,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-26 01:26:12,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 16: [2022-11-26 01:26:12,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 22: [2022-11-26 01:26:12,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 16: [2022-11-26 01:26:12,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 22: [2022-11-26 01:26:12,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 16: [2022-11-26 01:26:12,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 22: [2022-11-26 01:26:12,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 41: [2022-11-26 01:26:12,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-26 01:26:12,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-26 01:26:12,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 47: [2022-11-26 01:26:12,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-26 01:26:12,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 37: [2022-11-26 01:26:12,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 47: [2022-11-26 01:26:12,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 37: [2022-11-26 01:26:12,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 47: [2022-11-26 01:26:12,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 37: [2022-11-26 01:26:12,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 47: [2022-11-26 01:26:12,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 47: [2022-11-26 01:26:12,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 46: [2022-11-26 01:26:12,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-26 01:26:12,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 46: [2022-11-26 01:26:12,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 42: [2022-11-26 01:26:12,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 42: [2022-11-26 01:26:12,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 42: [2022-11-26 01:26:12,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 56: [2022-11-26 01:26:12,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-26 01:26:12,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 56: [2022-11-26 01:26:12,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 32: [2022-11-26 01:26:12,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 32: [2022-11-26 01:26:12,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-26 01:26:12,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 31: [2022-11-26 01:26:12,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-26 01:26:12,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-26 01:26:12,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 18: [2022-11-26 01:26:12,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-26 01:26:12,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-26 01:26:12,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 55: [2022-11-26 01:26:12,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 55: [2022-11-26 01:26:12,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-26 01:26:12,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 59: [2022-11-26 01:26:12,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-26 01:26:12,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-26 01:26:12,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 50: [2022-11-26 01:26:12,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 53: [2022-11-26 01:26:12,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 50: [2022-11-26 01:26:12,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 53: [2022-11-26 01:26:12,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 50: [2022-11-26 01:26:12,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 53: [2022-11-26 01:26:12,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 21: [2022-11-26 01:26:12,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-26 01:26:12,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-26 01:26:12,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 27: [2022-11-26 01:26:12,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-26 01:26:12,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-26 01:26:12,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 24: [2022-11-26 01:26:12,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-26 01:26:12,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-26 01:26:12,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 9: [2022-11-26 01:26:12,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 5: [2022-11-26 01:26:12,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 5: [2022-11-26 01:26:12,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 14: [2022-11-26 01:26:12,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 5: [2022-11-26 01:26:12,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 14: [2022-11-26 01:26:12,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 5: [2022-11-26 01:26:12,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-26 01:26:12,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 14: [2022-11-26 01:26:12,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 5: [2022-11-26 01:26:12,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 5: [2022-11-26 01:26:12,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 5: [2022-11-26 01:26:12,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-26 01:26:12,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 10: [2022-11-26 01:26:12,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 10: [2022-11-26 01:26:12,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-26 01:26:12,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-26 01:26:12,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 10: [2022-11-26 01:26:12,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 10: [2022-11-26 01:26:12,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-26 01:26:12,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 10: [2022-11-26 01:26:12,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 10: [2022-11-26 01:26:12,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 25: [2022-11-26 01:26:12,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-26 01:26:12,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-26 01:26:12,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 1: [2022-11-26 01:26:12,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 1: [2022-11-26 01:26:12,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-26 01:26:12,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 1: [2022-11-26 01:26:12,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-26 01:26:12,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-26 01:26:12,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 4: [2022-11-26 01:26:12,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 56: [2022-11-26 01:26:12,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 9: [2022-11-26 01:26:12,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-26 01:26:12,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 56: [2022-11-26 01:26:12,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-26 01:26:12,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 38: [2022-11-26 01:26:12,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-26 01:26:12,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 38: [2022-11-26 01:26:12,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-26 01:26:12,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-26 01:26:12,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 7: [2022-11-26 01:26:12,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-26 01:26:12,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-26 01:26:12,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 23: [2022-11-26 01:26:12,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-26 01:26:12,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-26 01:26:12,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 4: [2022-11-26 01:26:12,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-26 01:26:12,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 3: [2022-11-26 01:26:12,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-26 01:26:12,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 3: [2022-11-26 01:26:12,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 26: [2022-11-26 01:26:12,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 42: [2022-11-26 01:26:12,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 26: [2022-11-26 01:26:12,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 26: [2022-11-26 01:26:12,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 42: [2022-11-26 01:26:12,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-26 01:26:12,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 61: [2022-11-26 01:26:12,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-26 01:26:12,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-26 01:26:12,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 63: [2022-11-26 01:26:12,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-26 01:26:12,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-26 01:26:12,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 51: [2022-11-26 01:26:12,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-26 01:26:12,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-26 01:26:12,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 60: [2022-11-26 01:26:12,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-26 01:26:12,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-26 01:26:12,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 48: [2022-11-26 01:26:12,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 46: [2022-11-26 01:26:12,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 48: [2022-11-26 01:26:12,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 46: [2022-11-26 01:26:12,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 48: [2022-11-26 01:26:12,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 46: [2022-11-26 01:26:12,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 49: [2022-11-26 01:26:12,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-26 01:26:12,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-26 01:26:12,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-26 01:26:12,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 49: [2022-11-26 01:26:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-26 01:26:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-26 01:26:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-26 01:26:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 49: [2022-11-26 01:26:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 49: [2022-11-26 01:26:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 49: [2022-11-26 01:26:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 49: [2022-11-26 01:26:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 32: [2022-11-26 01:26:12,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-26 01:26:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-26 01:26:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 40: [2022-11-26 01:26:12,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-26 01:26:12,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-26 01:26:12,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 8: [2022-11-26 01:26:12,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-26 01:26:12,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-26 01:26:12,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 32: [2022-11-26 01:26:12,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-26 01:26:12,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-26 01:26:12,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 60: [2022-11-26 01:26:12,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-26 01:26:12,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-26 01:26:12,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 35: [2022-11-26 01:26:12,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-26 01:26:12,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-26 01:26:12,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 26: [2022-11-26 01:26:12,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-26 01:26:12,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 32: [2022-11-26 01:26:12,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-26 01:26:12,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 26: [2022-11-26 01:26:12,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 32: [2022-11-26 01:26:12,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 19: [2022-11-26 01:26:12,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 19: [2022-11-26 01:26:12,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-26 01:26:12,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 0: [2022-11-26 01:26:12,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-26 01:26:12,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 54: [2022-11-26 01:26:12,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 54: [2022-11-26 01:26:12,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-26 01:26:12,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 20: [2022-11-26 01:26:12,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-26 01:26:12,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-26 01:26:12,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 20: [2022-11-26 01:26:12,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-26 01:26:12,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 20: [2022-11-26 01:26:12,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 61: [2022-11-26 01:26:12,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 12: [2022-11-26 01:26:12,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 61: [2022-11-26 01:26:12,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 12: [2022-11-26 01:26:12,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 61: [2022-11-26 01:26:12,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 12: [2022-11-26 01:26:12,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 24: [2022-11-26 01:26:12,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 24: [2022-11-26 01:26:12,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-26 01:26:12,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 6: [2022-11-26 01:26:12,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-26 01:26:12,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 20: [2022-11-26 01:26:12,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-26 01:26:12,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 6: [2022-11-26 01:26:12,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 20: [2022-11-26 01:26:12,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-26 01:26:12,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-26 01:26:12,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 20: [2022-11-26 01:26:12,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 15: [2022-11-26 01:26:12,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-26 01:26:12,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-26 01:26:12,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 43: [2022-11-26 01:26:12,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-26 01:26:12,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-26 01:26:12,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 39: [2022-11-26 01:26:12,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 34: [2022-11-26 01:26:12,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 39: [2022-11-26 01:26:12,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-26 01:26:12,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 34: [2022-11-26 01:26:12,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-26 01:26:12,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 31: [2022-11-26 01:26:12,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-26 01:26:12,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-26 01:26:12,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 30: [2022-11-26 01:26:12,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 30: [2022-11-26 01:26:12,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-26 01:26:12,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 2: [2022-11-26 01:26:12,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-26 01:26:12,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-26 01:26:12,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 50: [2022-11-26 01:26:12,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-26 01:26:12,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-26 01:26:12,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 38: [2022-11-26 01:26:12,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-26 01:26:12,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-26 01:26:12,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 59: [2022-11-26 01:26:12,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-26 01:26:12,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-26 01:26:12,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 29: [2022-11-26 01:26:12,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-26 01:26:12,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-26 01:26:12,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-26 01:26:12,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-26 01:26:12,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-26 01:26:12,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-26 01:26:12,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-26 01:26:12,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-26 01:26:12,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 29: [2022-11-26 01:26:12,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 29: [2022-11-26 01:26:12,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 29: [2022-11-26 01:26:12,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 55: [2022-11-26 01:26:12,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-26 01:26:12,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-26 01:26:12,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 33: [2022-11-26 01:26:12,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-26 01:26:12,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-26 01:26:12,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 63: [2022-11-26 01:26:12,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 16: [2022-11-26 01:26:12,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 63: [2022-11-26 01:26:12,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 16: [2022-11-26 01:26:12,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 63: [2022-11-26 01:26:12,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 16: [2022-11-26 01:26:12,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 10: [2022-11-26 01:26:12,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-26 01:26:12,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-26 01:26:12,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 57: [2022-11-26 01:26:12,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 57: [2022-11-26 01:26:12,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-26 01:26:12,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 28: [2022-11-26 01:26:12,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-26 01:26:12,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-26 01:26:12,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 52: [2022-11-26 01:26:12,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 36: [2022-11-26 01:26:12,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 52: [2022-11-26 01:26:12,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 36: [2022-11-26 01:26:12,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 52: [2022-11-26 01:26:12,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 36: [2022-11-26 01:26:12,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 5: [2022-11-26 01:26:12,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-26 01:26:12,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-26 01:26:12,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 17: [2022-11-26 01:26:12,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-26 01:26:12,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-26 01:26:12,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 44: [2022-11-26 01:26:12,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-26 01:26:12,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-26 01:26:12,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 53: [2022-11-26 01:26:12,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-26 01:26:12,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-26 01:26:12,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 13: [2022-11-26 01:26:12,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-26 01:26:12,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-26 01:26:12,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 47: [2022-11-26 01:26:12,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-26 01:26:12,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-26 01:26:12,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 56: [2022-11-26 01:26:12,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-26 01:26:12,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-26 01:26:12,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 45: [2022-11-26 01:26:12,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 45: [2022-11-26 01:26:12,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 45: [2022-11-26 01:26:12,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 41: [2022-11-26 01:26:12,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-26 01:26:12,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-26 01:26:12,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 62: [2022-11-26 01:26:12,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-26 01:26:12,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-26 01:26:12,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 25: [2022-11-26 01:26:12,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 25: [2022-11-26 01:26:12,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-26 01:26:12,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 0: [2022-11-26 01:26:12,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-26 01:26:12,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-26 01:26:12,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 23: [2022-11-26 01:26:12,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-26 01:26:12,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-26 01:26:12,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 1: [2022-11-26 01:26:12,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-26 01:26:12,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-26 01:26:12,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 20: [2022-11-26 01:26:12,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-26 01:26:12,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-26 01:26:12,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 22: [2022-11-26 01:26:12,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-26 01:26:12,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-26 01:26:12,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 5: [2022-11-26 01:26:12,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-26 01:26:12,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-26 01:26:12,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 3: [2022-11-26 01:26:12,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 11: [2022-11-26 01:26:12,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 11: [2022-11-26 01:26:12,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 3: [2022-11-26 01:26:12,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 11: [2022-11-26 01:26:12,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 3: [2022-11-26 01:26:12,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 25: [2022-11-26 01:26:12,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 25: [2022-11-26 01:26:12,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-26 01:26:12,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 14: [2022-11-26 01:26:12,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 14: [2022-11-26 01:26:12,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-26 01:26:12,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 9: [2022-11-26 01:26:12,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 9: [2022-11-26 01:26:12,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-26 01:26:12,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 37: [2022-11-26 01:26:12,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-26 01:26:12,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-26 01:26:12,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 49: [2022-11-26 01:26:12,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-26 01:26:12,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-26 01:26:12,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 4: [2022-11-26 01:26:12,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-26 01:26:12,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-26 01:26:12,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 29: [2022-11-26 01:26:12,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-26 01:26:12,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-26 01:26:12,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 21: [2022-11-26 01:26:12,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-26 01:26:12,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-26 01:26:12,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 7: [2022-11-26 01:26:12,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 18: [2022-11-26 01:26:12,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 7: [2022-11-26 01:26:12,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 18: [2022-11-26 01:26:12,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 7: [2022-11-26 01:26:12,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 18: [2022-11-26 01:26:12,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 42: [2022-11-26 01:26:12,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 42: [2022-11-26 01:26:12,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 42: [2022-11-26 01:26:12,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 32: [2022-11-26 01:26:12,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-26 01:26:12,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-26 01:26:12,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 27: [2022-11-26 01:26:12,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-26 01:26:12,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-26 01:26:12,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 8: [2022-11-26 01:26:12,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-26 01:26:12,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 8: [2022-11-26 01:26:12,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 36: [2022-11-26 01:26:12,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 36: [2022-11-26 01:26:12,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-26 01:26:12,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 28: [2022-11-26 01:26:12,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 28: [2022-11-26 01:26:12,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-26 01:26:12,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 54: [2022-11-26 01:26:12,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 12: [2022-11-26 01:26:12,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 12: [2022-11-26 01:26:12,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-26 01:26:12,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 19: [2022-11-26 01:26:12,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-26 01:26:12,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-26 01:26:12,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 6: [2022-11-26 01:26:12,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 6: [2022-11-26 01:26:12,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-26 01:26:12,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 54: [2022-11-26 01:26:12,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-26 01:26:12,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 39: [2022-11-26 01:26:12,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-26 01:26:12,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-26 01:26:12,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 51: [2022-11-26 01:26:12,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-26 01:26:12,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-26 01:26:12,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 61: [2022-11-26 01:26:12,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-26 01:26:12,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-26 01:26:12,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 58: [2022-11-26 01:26:12,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-26 01:26:12,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-26 01:26:12,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 40: [2022-11-26 01:26:12,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-26 01:26:12,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-26 01:26:12,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 48: [2022-11-26 01:26:12,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-26 01:26:12,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-26 01:26:12,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 24: [2022-11-26 01:26:12,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 50: [2022-11-26 01:26:12,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 15: [2022-11-26 01:26:12,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 50: [2022-11-26 01:26:12,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 15: [2022-11-26 01:26:12,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 24: [2022-11-26 01:26:12,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 50: [2022-11-26 01:26:12,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 15: [2022-11-26 01:26:12,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 24: [2022-11-26 01:26:12,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 30: [2022-11-26 01:26:12,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-26 01:26:12,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 30: [2022-11-26 01:26:12,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 52: [2022-11-26 01:26:12,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 52: [2022-11-26 01:26:12,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-26 01:26:12,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 63: [2022-11-26 01:26:12,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-26 01:26:12,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-26 01:26:12,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 46: [2022-11-26 01:26:12,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-26 01:26:12,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-26 01:26:12,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 33: [2022-11-26 01:26:12,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-26 01:26:12,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-26 01:26:12,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 59: [2022-11-26 01:26:12,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-26 01:26:12,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-26 01:26:12,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 10: [2022-11-26 01:26:12,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-26 01:26:12,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-26 01:26:12,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 31: [2022-11-26 01:26:12,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-26 01:26:12,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-26 01:26:12,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 35: [2022-11-26 01:26:12,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-26 01:26:12,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-26 01:26:12,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 26: [2022-11-26 01:26:12,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 26: [2022-11-26 01:26:12,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-26 01:26:12,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 34: [2022-11-26 01:26:12,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 34: [2022-11-26 01:26:12,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-26 01:26:12,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 55: [2022-11-26 01:26:12,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 55: [2022-11-26 01:26:12,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 57: [2022-11-26 01:26:12,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 55: [2022-11-26 01:26:12,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 57: [2022-11-26 01:26:12,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 57: [2022-11-26 01:26:12,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 60: [2022-11-26 01:26:12,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-26 01:26:12,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-26 01:26:12,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 13: [2022-11-26 01:26:12,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-26 01:26:12,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-26 01:26:12,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 38: [2022-11-26 01:26:12,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 38: [2022-11-26 01:26:12,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-26 01:26:12,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 2: [2022-11-26 01:26:12,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-26 01:26:12,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-26 01:26:12,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 56: [2022-11-26 01:26:12,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 56: [2022-11-26 01:26:12,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-26 01:26:12,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 41: [2022-11-26 01:26:12,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-26 01:26:12,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-26 01:26:12,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 17: [2022-11-26 01:26:12,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-26 01:26:12,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-26 01:26:12,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 47: [2022-11-26 01:26:12,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-26 01:26:12,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-26 01:26:12,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 43: [2022-11-26 01:26:12,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 43: [2022-11-26 01:26:12,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 43: [2022-11-26 01:26:12,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 45: [2022-11-26 01:26:12,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-26 01:26:12,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-26 01:26:12,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 36: [2022-11-26 01:26:12,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 36: [2022-11-26 01:26:12,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-26 01:26:12,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 0: [2022-11-26 01:26:12,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-26 01:26:12,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-26 01:26:12,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 53: [2022-11-26 01:26:12,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-26 01:26:12,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-26 01:26:12,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 62: [2022-11-26 01:26:12,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-26 01:26:12,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-26 01:26:12,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 16: [2022-11-26 01:26:12,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-26 01:26:12,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-26 01:26:12,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 3: [2022-11-26 01:26:12,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-26 01:26:12,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-26 01:26:12,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 32: [2022-11-26 01:26:12,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 44: [2022-11-26 01:26:12,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-26 01:26:12,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-26 01:26:12,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 32: [2022-11-26 01:26:12,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-26 01:26:12,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 20: [2022-11-26 01:26:12,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-26 01:26:12,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-26 01:26:12,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 21: [2022-11-26 01:26:12,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-26 01:26:12,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-26 01:26:12,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 37: [2022-11-26 01:26:12,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 37: [2022-11-26 01:26:12,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-26 01:26:12,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 11: [2022-11-26 01:26:12,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-26 01:26:12,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-26 01:26:12,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 23: [2022-11-26 01:26:12,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 18: [2022-11-26 01:26:12,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 23: [2022-11-26 01:26:12,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-26 01:26:12,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 18: [2022-11-26 01:26:12,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-26 01:26:12,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 49: [2022-11-26 01:26:12,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-26 01:26:12,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-26 01:26:12,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 27: [2022-11-26 01:26:12,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-26 01:26:12,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-26 01:26:12,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 9: [2022-11-26 01:26:12,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-26 01:26:12,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-26 01:26:12,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 42: [2022-11-26 01:26:12,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 42: [2022-11-26 01:26:12,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-26 01:26:12,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 7: [2022-11-26 01:26:12,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-26 01:26:12,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-26 01:26:12,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 1: [2022-11-26 01:26:12,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-26 01:26:12,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-26 01:26:12,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 29: [2022-11-26 01:26:12,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-26 01:26:12,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-26 01:26:12,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 4: [2022-11-26 01:26:12,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 4: [2022-11-26 01:26:12,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-26 01:26:12,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 22: [2022-11-26 01:26:12,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-26 01:26:12,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-26 01:26:12,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 8: [2022-11-26 01:26:12,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-26 01:26:12,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-26 01:26:12,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 24: [2022-11-26 01:26:12,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-26 01:26:12,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-26 01:26:12,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 52: [2022-11-26 01:26:12,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-26 01:26:12,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-26 01:26:12,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 60: [2022-11-26 01:26:12,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-26 01:26:12,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-26 01:26:12,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 39: [2022-11-26 01:26:12,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-26 01:26:12,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-26 01:26:12,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 19: [2022-11-26 01:26:12,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 26: [2022-11-26 01:26:12,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-26 01:26:12,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-26 01:26:12,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 30: [2022-11-26 01:26:12,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 19: [2022-11-26 01:26:12,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-26 01:26:12,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 30: [2022-11-26 01:26:12,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 13: [2022-11-26 01:26:12,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 2: [2022-11-26 01:26:12,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 30: [2022-11-26 01:26:12,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 28: [2022-11-26 01:26:12,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 2: [2022-11-26 01:26:12,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 13: [2022-11-26 01:26:12,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 28: [2022-11-26 01:26:12,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 2: [2022-11-26 01:26:12,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 13: [2022-11-26 01:26:12,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 28: [2022-11-26 01:26:12,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 50: [2022-11-26 01:26:12,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-26 01:26:12,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-26 01:26:12,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 15: [2022-11-26 01:26:12,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-26 01:26:12,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 59: [2022-11-26 01:26:12,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 15: [2022-11-26 01:26:12,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 59: [2022-11-26 01:26:12,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 58: [2022-11-26 01:26:12,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 59: [2022-11-26 01:26:12,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 58: [2022-11-26 01:26:12,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-26 01:26:12,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 33: [2022-11-26 01:26:12,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-26 01:26:12,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-26 01:26:12,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 12: [2022-11-26 01:26:12,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-26 01:26:12,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-26 01:26:12,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 6: [2022-11-26 01:26:12,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 55: [2022-11-26 01:26:12,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-26 01:26:12,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 6: [2022-11-26 01:26:12,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 55: [2022-11-26 01:26:12,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 6: [2022-11-26 01:26:12,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 43: [2022-11-26 01:26:12,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-26 01:26:12,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-26 01:26:12,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 51: [2022-11-26 01:26:12,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 51: [2022-11-26 01:26:12,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-26 01:26:12,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 31: [2022-11-26 01:26:12,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 48: [2022-11-26 01:26:12,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-26 01:26:12,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 31: [2022-11-26 01:26:12,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 48: [2022-11-26 01:26:12,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 31: [2022-11-26 01:26:12,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 40: [2022-11-26 01:26:12,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-26 01:26:12,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-26 01:26:12,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 57: [2022-11-26 01:26:12,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 57: [2022-11-26 01:26:12,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 57: [2022-11-26 01:26:12,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 16: [2022-11-26 01:26:12,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-26 01:26:12,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 14: [2022-11-26 01:26:12,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 16: [2022-11-26 01:26:12,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 14: [2022-11-26 01:26:12,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-26 01:26:12,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 10: [2022-11-26 01:26:12,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-26 01:26:12,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-26 01:26:12,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 54: [2022-11-26 01:26:12,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 54: [2022-11-26 01:26:12,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-26 01:26:12,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 34: [2022-11-26 01:26:12,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 46: [2022-11-26 01:26:12,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 34: [2022-11-26 01:26:12,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-26 01:26:12,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 46: [2022-11-26 01:26:12,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-26 01:26:12,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 38: [2022-11-26 01:26:12,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-26 01:26:12,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-26 01:26:12,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 44: [2022-11-26 01:26:12,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-26 01:26:12,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-26 01:26:12,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 61: [2022-11-26 01:26:12,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-26 01:26:12,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-26 01:26:12,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 17: [2022-11-26 01:26:12,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-26 01:26:12,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-26 01:26:12,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 47: [2022-11-26 01:26:12,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-26 01:26:12,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-26 01:26:12,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 41: [2022-11-26 01:26:12,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 41: [2022-11-26 01:26:12,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 41: [2022-11-26 01:26:12,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 56: [2022-11-26 01:26:12,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-26 01:26:12,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-26 01:26:12,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 36: [2022-11-26 01:26:12,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-26 01:26:12,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 36: [2022-11-26 01:26:12,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 45: [2022-11-26 01:26:12,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 5: [2022-11-26 01:26:12,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-26 01:26:12,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-26 01:26:12,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 45: [2022-11-26 01:26:12,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-26 01:26:12,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 53: [2022-11-26 01:26:12,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-26 01:26:12,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 23: [2022-11-26 01:26:12,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 53: [2022-11-26 01:26:12,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 23: [2022-11-26 01:26:12,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 23: [2022-11-26 01:26:12,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 0: [2022-11-26 01:26:12,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-26 01:26:12,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-26 01:26:12,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 62: [2022-11-26 01:26:12,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-26 01:26:12,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-26 01:26:12,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 22: [2022-11-26 01:26:12,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-26 01:26:12,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-26 01:26:12,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 3: [2022-11-26 01:26:12,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-26 01:26:12,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 3: [2022-11-26 01:26:12,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 20: [2022-11-26 01:26:12,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 20: [2022-11-26 01:26:12,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-26 01:26:12,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 11: [2022-11-26 01:26:12,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 11: [2022-11-26 01:26:12,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 11: [2022-11-26 01:26:12,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 21: [2022-11-26 01:26:12,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-26 01:26:12,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-26 01:26:12,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 37: [2022-11-26 01:26:12,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 37: [2022-11-26 01:26:12,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-26 01:26:12,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 9: [2022-11-26 01:26:12,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 9: [2022-11-26 01:26:12,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-26 01:26:12,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 18: [2022-11-26 01:26:12,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-26 01:26:12,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-26 01:26:12,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 63: [2022-11-26 01:26:12,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-26 01:26:12,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-26 01:26:12,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 25: [2022-11-26 01:26:12,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-26 01:26:12,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-26 01:26:12,513] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 42: [2022-11-26 01:26:12,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 42: [2022-11-26 01:26:12,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-26 01:26:12,513] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 35: [2022-11-26 01:26:12,514] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-26 01:26:12,514] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-26 01:26:12,514] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 4: [2022-11-26 01:26:12,514] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 4: [2022-11-26 01:26:12,514] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-26 01:26:12,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 27: [2022-11-26 01:26:12,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-26 01:26:12,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-26 01:26:12,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 14: [2022-11-26 01:26:12,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 14: [2022-11-26 01:26:12,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-26 01:26:12,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 7: [2022-11-26 01:26:12,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-26 01:26:12,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-26 01:26:12,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 49: [2022-11-26 01:26:12,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-26 01:26:12,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-26 01:26:12,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 1: [2022-11-26 01:26:12,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 1: [2022-11-26 01:26:12,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 6: [2022-11-26 01:26:12,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 1: [2022-11-26 01:26:12,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 6: [2022-11-26 01:26:12,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-26 01:26:12,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 25: [2022-11-26 01:26:12,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-26 01:26:12,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-26 01:26:12,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 32: [2022-11-26 01:26:12,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-26 01:26:12,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-26 01:26:12,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 39: [2022-11-26 01:26:12,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-26 01:26:12,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-26 01:26:12,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 29: [2022-11-26 01:26:12,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-26 01:26:12,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 29: [2022-11-26 01:26:12,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 61: [2022-11-26 01:26:12,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-26 01:26:12,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-26 01:26:12,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 19: [2022-11-26 01:26:12,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-26 01:26:12,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-26 01:26:12,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 54: [2022-11-26 01:26:12,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 54: [2022-11-26 01:26:12,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-26 01:26:12,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 35: [2022-11-26 01:26:12,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-26 01:26:12,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-26 01:26:12,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 24: [2022-11-26 01:26:12,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-26 01:26:12,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-26 01:26:12,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 63: [2022-11-26 01:26:12,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-26 01:26:12,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-26 01:26:12,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 30: [2022-11-26 01:26:12,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-26 01:26:12,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-26 01:26:12,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 15: [2022-11-26 01:26:12,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 15: [2022-11-26 01:26:12,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-26 01:26:12,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 8: [2022-11-26 01:26:12,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-26 01:26:12,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-26 01:26:12,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 43: [2022-11-26 01:26:12,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-26 01:26:12,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-26 01:26:12,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 60: [2022-11-26 01:26:12,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-26 01:26:12,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-26 01:26:12,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 46: [2022-11-26 01:26:12,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 28: [2022-11-26 01:26:12,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 46: [2022-11-26 01:26:12,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 28: [2022-11-26 01:26:12,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 46: [2022-11-26 01:26:12,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 28: [2022-11-26 01:26:12,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 13: [2022-11-26 01:26:12,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 13: [2022-11-26 01:26:12,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-26 01:26:12,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 40: [2022-11-26 01:26:12,547] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-26 01:26:12,547] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-26 01:26:12,547] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 26: [2022-11-26 01:26:12,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 26: [2022-11-26 01:26:12,549] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-26 01:26:12,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 2: [2022-11-26 01:26:12,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-26 01:26:12,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-26 01:26:12,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 59: [2022-11-26 01:26:12,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-26 01:26:12,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-26 01:26:12,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 34: [2022-11-26 01:26:12,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 34: [2022-11-26 01:26:12,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 50: [2022-11-26 01:26:12,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 34: [2022-11-26 01:26:12,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 50: [2022-11-26 01:26:12,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-26 01:26:12,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 31: [2022-11-26 01:26:12,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-26 01:26:12,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-26 01:26:12,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 33: [2022-11-26 01:26:12,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-26 01:26:12,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-26 01:26:12,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 56: [2022-11-26 01:26:12,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 58: [2022-11-26 01:26:12,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-26 01:26:12,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-26 01:26:12,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 56: [2022-11-26 01:26:12,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 56: [2022-11-26 01:26:12,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 52: [2022-11-26 01:26:12,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-26 01:26:12,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-26 01:26:12,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 22: [2022-11-26 01:26:12,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 55: [2022-11-26 01:26:12,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 22: [2022-11-26 01:26:12,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-26 01:26:12,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 55: [2022-11-26 01:26:12,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-26 01:26:12,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 51: [2022-11-26 01:26:12,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-26 01:26:12,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-26 01:26:12,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 38: [2022-11-26 01:26:12,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 38: [2022-11-26 01:26:12,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-26 01:26:12,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 10: [2022-11-26 01:26:12,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 12: [2022-11-26 01:26:12,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-26 01:26:12,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 10: [2022-11-26 01:26:12,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 12: [2022-11-26 01:26:12,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 10: [2022-11-26 01:26:12,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 11: [2022-11-26 01:26:12,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-26 01:26:12,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 11: [2022-11-26 01:26:12,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 14: [2022-11-26 01:26:12,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-26 01:26:12,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-26 01:26:12,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 47: [2022-11-26 01:26:12,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 57: [2022-11-26 01:26:12,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 47: [2022-11-26 01:26:12,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-26 01:26:12,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 57: [2022-11-26 01:26:12,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-26 01:26:12,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 3: [2022-11-26 01:26:12,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-26 01:26:12,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-26 01:26:12,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 16: [2022-11-26 01:26:12,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-26 01:26:12,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-26 01:26:12,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 37: [2022-11-26 01:26:12,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-26 01:26:12,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-26 01:26:12,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 62: [2022-11-26 01:26:12,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 41: [2022-11-26 01:26:12,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-26 01:26:12,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 62: [2022-11-26 01:26:12,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 41: [2022-11-26 01:26:12,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 62: [2022-11-26 01:26:12,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 20: [2022-11-26 01:26:12,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-26 01:26:12,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-26 01:26:12,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 6: [2022-11-26 01:26:12,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-26 01:26:12,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-26 01:26:12,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 32: [2022-11-26 01:26:12,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 32: [2022-11-26 01:26:12,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 0: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-26 01:26:12,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 25: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 0: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 45: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 17: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 25: [2022-11-26 01:26:12,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 5: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 25: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 5: [2022-11-26 01:26:12,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 17: [2022-11-26 01:26:12,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 27: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 5: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 45: [2022-11-26 01:26:12,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 17: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 45: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 21: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 27: [2022-11-26 01:26:12,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 21: [2022-11-26 01:26:12,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 27: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 42: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 36: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 21: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 42: [2022-11-26 01:26:12,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 36: [2022-11-26 01:26:12,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-26 01:26:12,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 23: [2022-11-26 01:26:12,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-26 01:26:12,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 23: [2022-11-26 01:26:12,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 7: [2022-11-26 01:26:12,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-26 01:26:12,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-26 01:26:12,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 18: [2022-11-26 01:26:12,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-26 01:26:12,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-26 01:26:12,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 1: [2022-11-26 01:26:12,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 49: [2022-11-26 01:26:12,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-26 01:26:12,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-26 01:26:12,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 5: [2022-11-26 01:26:12,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 1: [2022-11-26 01:26:12,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 1: [2022-11-26 01:26:12,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 4: [2022-11-26 01:26:12,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 5: [2022-11-26 01:26:12,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 4: [2022-11-26 01:26:12,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 5: [2022-11-26 01:26:12,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 4: [2022-11-26 01:26:12,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 28: [2022-11-26 01:26:12,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-26 01:26:12,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-26 01:26:12,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 24: [2022-11-26 01:26:12,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-26 01:26:12,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-26 01:26:12,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 61: [2022-11-26 01:26:12,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 61: [2022-11-26 01:26:12,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-26 01:26:12,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 39: [2022-11-26 01:26:12,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-26 01:26:12,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-26 01:26:12,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 54: [2022-11-26 01:26:12,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 29: [2022-11-26 01:26:12,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 29: [2022-11-26 01:26:12,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-26 01:26:12,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 54: [2022-11-26 01:26:12,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-26 01:26:12,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 19: [2022-11-26 01:26:12,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-26 01:26:12,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-26 01:26:12,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 53: [2022-11-26 01:26:12,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-26 01:26:12,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-26 01:26:12,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 35: [2022-11-26 01:26:12,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-26 01:26:12,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-26 01:26:12,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 44: [2022-11-26 01:26:12,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 44: [2022-11-26 01:26:12,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 44: [2022-11-26 01:26:12,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 9: [2022-11-26 01:26:12,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 9: [2022-11-26 01:26:12,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 63: [2022-11-26 01:26:12,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 9: [2022-11-26 01:26:12,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 63: [2022-11-26 01:26:12,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-26 01:26:12,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 12: [2022-11-26 01:26:12,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-26 01:26:12,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-26 01:26:12,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 13: [2022-11-26 01:26:12,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 13: [2022-11-26 01:26:12,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-26 01:26:12,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 59: [2022-11-26 01:26:12,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-26 01:26:12,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-26 01:26:12,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 30: [2022-11-26 01:26:12,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-26 01:26:12,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-26 01:26:12,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 51: [2022-11-26 01:26:12,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-26 01:26:12,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-26 01:26:12,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 31: [2022-11-26 01:26:12,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 15: [2022-11-26 01:26:12,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 31: [2022-11-26 01:26:12,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-26 01:26:12,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 15: [2022-11-26 01:26:12,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-26 01:26:12,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 43: [2022-11-26 01:26:12,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-26 01:26:12,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-26 01:26:12,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 36: [2022-11-26 01:26:12,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-26 01:26:12,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 47: [2022-11-26 01:26:12,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 10: [2022-11-26 01:26:12,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 36: [2022-11-26 01:26:12,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 47: [2022-11-26 01:26:12,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 10: [2022-11-26 01:26:12,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 47: [2022-11-26 01:26:12,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 10: [2022-11-26 01:26:12,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 33: [2022-11-26 01:26:12,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 60: [2022-11-26 01:26:12,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 33: [2022-11-26 01:26:12,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-26 01:26:12,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 50: [2022-11-26 01:26:12,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 60: [2022-11-26 01:26:12,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 60: [2022-11-26 01:26:12,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 50: [2022-11-26 01:26:12,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-26 01:26:12,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 34: [2022-11-26 01:26:12,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-26 01:26:12,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-26 01:26:12,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 2: [2022-11-26 01:26:12,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 2: [2022-11-26 01:26:12,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-26 01:26:12,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 52: [2022-11-26 01:26:12,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-26 01:26:12,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-26 01:26:12,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 40: [2022-11-26 01:26:12,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-26 01:26:12,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-26 01:26:12,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 57: [2022-11-26 01:26:12,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 16: [2022-11-26 01:26:12,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-26 01:26:12,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 57: [2022-11-26 01:26:12,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 57: [2022-11-26 01:26:12,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 16: [2022-11-26 01:26:12,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 22: [2022-11-26 01:26:12,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 17: [2022-11-26 01:26:12,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 22: [2022-11-26 01:26:12,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 38: [2022-11-26 01:26:12,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 17: [2022-11-26 01:26:12,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 22: [2022-11-26 01:26:12,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 17: [2022-11-26 01:26:12,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 38: [2022-11-26 01:26:12,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-26 01:26:12,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 55: [2022-11-26 01:26:12,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-26 01:26:12,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-26 01:26:12,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 48: [2022-11-26 01:26:12,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-26 01:26:12,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-26 01:26:12,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 46: [2022-11-26 01:26:12,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-26 01:26:12,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-26 01:26:12,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 26: [2022-11-26 01:26:12,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 26: [2022-11-26 01:26:12,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-26 01:26:12,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 53: [2022-11-26 01:26:12,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-26 01:26:12,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-26 01:26:12,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 58: [2022-11-26 01:26:12,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 43: [2022-11-26 01:26:12,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 58: [2022-11-26 01:26:12,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 43: [2022-11-26 01:26:12,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 58: [2022-11-26 01:26:12,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 43: [2022-11-26 01:26:12,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 52: [2022-11-26 01:26:12,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-26 01:26:12,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-26 01:26:12,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 60: [2022-11-26 01:26:12,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-26 01:26:12,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-26 01:26:12,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 54: [2022-11-26 01:26:12,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 51: [2022-11-26 01:26:12,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 54: [2022-11-26 01:26:12,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-26 01:26:12,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 51: [2022-11-26 01:26:12,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-26 01:26:12,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 8: [2022-11-26 01:26:12,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 8: [2022-11-26 01:26:12,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-26 01:26:12,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 56: [2022-11-26 01:26:12,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 56: [2022-11-26 01:26:12,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 39: [2022-11-26 01:26:12,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 56: [2022-11-26 01:26:12,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 39: [2022-11-26 01:26:12,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-26 01:26:12,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 48: [2022-11-26 01:26:12,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 44: [2022-11-26 01:26:12,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 48: [2022-11-26 01:26:12,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 44: [2022-11-26 01:26:12,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 48: [2022-11-26 01:26:12,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 44: [2022-11-26 01:26:12,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 35: [2022-11-26 01:26:12,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 35: [2022-11-26 01:26:12,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-26 01:26:12,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 58: [2022-11-26 01:26:12,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 58: [2022-11-26 01:26:12,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-26 01:26:12,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 30: [2022-11-26 01:26:12,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-26 01:26:12,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-26 01:26:12,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 39: [2022-11-26 01:26:12,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-26 01:26:12,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-26 01:26:12,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 55: [2022-11-26 01:26:12,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-26 01:26:12,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-26 01:26:12,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 54: [2022-11-26 01:26:12,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 54: [2022-11-26 01:26:12,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-26 01:26:12,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 30: [2022-11-26 01:26:12,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-26 01:26:12,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-26 01:26:12,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 35: [2022-11-26 01:26:12,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 35: [2022-11-26 01:26:12,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-26 01:26:12,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 8: [2022-11-26 01:26:12,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 8: [2022-11-26 01:26:12,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-26 01:26:12,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 26: [2022-11-26 01:26:12,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-26 01:26:12,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-26 01:26:12,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 51: [2022-11-26 01:26:12,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 51: [2022-11-26 01:26:12,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-26 01:26:12,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 30: [2022-11-26 01:26:12,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-26 01:26:12,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-26 01:26:12,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 19: [2022-11-26 01:26:12,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 19: [2022-11-26 01:26:12,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-26 01:26:12,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 55: [2022-11-26 01:26:12,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-26 01:26:12,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 52: [2022-11-26 01:26:12,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 55: [2022-11-26 01:26:12,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 54: [2022-11-26 01:26:12,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 52: [2022-11-26 01:26:12,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-26 01:26:12,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 54: [2022-11-26 01:26:12,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-26 01:26:12,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 43: [2022-11-26 01:26:12,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 12: [2022-11-26 01:26:12,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 43: [2022-11-26 01:26:12,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-26 01:26:12,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 12: [2022-11-26 01:26:12,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-26 01:26:12,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 12: [2022-11-26 01:26:12,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-26 01:26:12,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-26 01:26:12,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 48: [2022-11-26 01:26:12,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-26 01:26:12,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-26 01:26:12,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 48: [2022-11-26 01:26:12,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-26 01:26:12,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-26 01:26:12,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 13: [2022-11-26 01:26:12,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-26 01:26:12,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-26 01:26:12,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-26 01:26:12,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-26 01:26:12,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 13: [2022-11-26 01:26:12,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 60: [2022-11-26 01:26:12,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 60: [2022-11-26 01:26:12,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-26 01:26:12,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 13: [2022-11-26 01:26:12,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 13: [2022-11-26 01:26:12,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-26 01:26:12,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 19: [2022-11-26 01:26:12,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-26 01:26:12,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-26 01:26:12,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 12: [2022-11-26 01:26:12,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-26 01:26:12,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-26 01:26:12,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 39: [2022-11-26 01:26:12,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-26 01:26:12,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-26 01:26:12,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 19: [2022-11-26 01:26:12,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-26 01:26:12,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-26 01:26:12,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 35: [2022-11-26 01:26:12,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 35: [2022-11-26 01:26:12,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-26 01:26:12,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 8: [2022-11-26 01:26:12,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-26 01:26:12,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-26 01:26:12,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-26 01:26:12,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-26 01:26:12,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 8: [2022-11-26 01:26:12,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 26: [2022-11-26 01:26:12,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-26 01:26:12,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-26 01:26:12,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 43: [2022-11-26 01:26:12,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 63: time (ms) | save-checkpoint: 5238.68 0: successfully saved checkpoint at iteration 12000 to checkpoints_3b9 43: [2022-11-26 01:26:12,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step12000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-26 01:26:12,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! 63: iteration 12010/ 24424 | consumed samples: 6149120 | consumed tokens: 12593397760 | elapsed time per iteration (s): 2.86 | learning rate: 1.138E-04 | global batch size: 512 | lm loss: 2.089235E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.073 | TFLOPs: 18.43 | 63: iteration 12020/ 24424 | consumed samples: 6154240 | consumed tokens: 12603883520 | elapsed time per iteration (s): 2.25 | learning rate: 1.137E-04 | global batch size: 512 | lm loss: 2.104481E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.213 | TFLOPs: 23.39 | 63: iteration 12030/ 24424 | consumed samples: 6159360 | consumed tokens: 12614369280 | elapsed time per iteration (s): 2.23 | learning rate: 1.136E-04 | global batch size: 512 | lm loss: 2.106518E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.157 | TFLOPs: 23.59 | 63: iteration 12040/ 24424 | consumed samples: 6164480 | consumed tokens: 12624855040 | elapsed time per iteration (s): 2.25 | learning rate: 1.134E-04 | global batch size: 512 | lm loss: 2.101763E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.385 | TFLOPs: 23.41 | 63: iteration 12050/ 24424 | consumed samples: 6169600 | consumed tokens: 12635340800 | elapsed time per iteration (s): 2.24 | learning rate: 1.133E-04 | global batch size: 512 | lm loss: 2.081112E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.523 | TFLOPs: 23.53 | 63: iteration 12060/ 24424 | consumed samples: 6174720 | consumed tokens: 12645826560 | elapsed time per iteration (s): 2.23 | learning rate: 1.132E-04 | global batch size: 512 | lm loss: 2.108657E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.014 | TFLOPs: 23.68 | 63: iteration 12070/ 24424 | consumed samples: 6179840 | consumed tokens: 12656312320 | elapsed time per iteration (s): 2.28 | learning rate: 1.131E-04 | global batch size: 512 | lm loss: 2.088242E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.935 | TFLOPs: 23.16 | 63: iteration 12080/ 24424 | consumed samples: 6184960 | consumed tokens: 12666798080 | elapsed time per iteration (s): 2.27 | learning rate: 1.130E-04 | global batch size: 512 | lm loss: 2.103186E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.039 | TFLOPs: 23.27 | 63: iteration 12090/ 24424 | consumed samples: 6190080 | consumed tokens: 12677283840 | elapsed time per iteration (s): 2.23 | learning rate: 1.129E-04 | global batch size: 512 | lm loss: 2.097826E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.782 | TFLOPs: 23.66 | 63: iteration 12100/ 24424 | consumed samples: 6195200 | consumed tokens: 12687769600 | elapsed time per iteration (s): 2.43 | learning rate: 1.127E-04 | global batch size: 512 | lm loss: 2.101202E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 210.617 | TFLOPs: 21.68 | 63: iteration 12110/ 24424 | consumed samples: 6200320 | consumed tokens: 12698255360 | elapsed time per iteration (s): 2.27 | learning rate: 1.126E-04 | global batch size: 512 | lm loss: 2.083784E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.669 | TFLOPs: 23.23 | 63: iteration 12120/ 24424 | consumed samples: 6205440 | consumed tokens: 12708741120 | elapsed time per iteration (s): 2.25 | learning rate: 1.125E-04 | global batch size: 512 | lm loss: 2.092729E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.022 | TFLOPs: 23.47 | 63: iteration 12130/ 24424 | consumed samples: 6210560 | consumed tokens: 12719226880 | elapsed time per iteration (s): 2.23 | learning rate: 1.124E-04 | global batch size: 512 | lm loss: 2.097239E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.824 | TFLOPs: 23.66 | 63: iteration 12140/ 24424 | consumed samples: 6215680 | consumed tokens: 12729712640 | elapsed time per iteration (s): 4.58 | learning rate: 1.123E-04 | global batch size: 512 | lm loss: 2.107261E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 111.737 | TFLOPs: 11.50 | 63: iteration 12150/ 24424 | consumed samples: 6220800 | consumed tokens: 12740198400 | elapsed time per iteration (s): 2.26 | learning rate: 1.122E-04 | global batch size: 512 | lm loss: 2.077281E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.857 | TFLOPs: 23.35 | 63: iteration 12160/ 24424 | consumed samples: 6225920 | consumed tokens: 12750684160 | elapsed time per iteration (s): 2.25 | learning rate: 1.120E-04 | global batch size: 512 | lm loss: 2.089893E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.273 | TFLOPs: 23.40 | 63: iteration 12170/ 24424 | consumed samples: 6231040 | consumed tokens: 12761169920 | elapsed time per iteration (s): 2.25 | learning rate: 1.119E-04 | global batch size: 512 | lm loss: 2.099887E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.620 | TFLOPs: 23.43 | 63: iteration 12180/ 24424 | consumed samples: 6236160 | consumed tokens: 12771655680 | elapsed time per iteration (s): 2.23 | learning rate: 1.118E-04 | global batch size: 512 | lm loss: 2.094482E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.707 | TFLOPs: 23.65 | 63: iteration 12190/ 24424 | consumed samples: 6241280 | consumed tokens: 12782141440 | elapsed time per iteration (s): 2.24 | learning rate: 1.117E-04 | global batch size: 512 | lm loss: 2.083575E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.573 | TFLOPs: 23.53 | 63: iteration 12200/ 24424 | consumed samples: 6246400 | consumed tokens: 12792627200 | elapsed time per iteration (s): 2.23 | learning rate: 1.116E-04 | global batch size: 512 | lm loss: 2.091117E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.358 | TFLOPs: 23.61 | 63: iteration 12210/ 24424 | consumed samples: 6251520 | consumed tokens: 12803112960 | elapsed time per iteration (s): 2.23 | learning rate: 1.115E-04 | global batch size: 512 | lm loss: 2.106001E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.978 | TFLOPs: 23.68 | 63: iteration 12220/ 24424 | consumed samples: 6256640 | consumed tokens: 12813598720 | elapsed time per iteration (s): 2.23 | learning rate: 1.113E-04 | global batch size: 512 | lm loss: 2.104857E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.392 | TFLOPs: 23.61 | 63: iteration 12230/ 24424 | consumed samples: 6261760 | consumed tokens: 12824084480 | elapsed time per iteration (s): 2.23 | learning rate: 1.112E-04 | global batch size: 512 | lm loss: 2.097569E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.673 | TFLOPs: 23.64 | 63: iteration 12240/ 24424 | consumed samples: 6266880 | consumed tokens: 12834570240 | elapsed time per iteration (s): 2.24 | learning rate: 1.111E-04 | global batch size: 512 | lm loss: 2.099079E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.189 | TFLOPs: 23.49 | 63: iteration 12250/ 24424 | consumed samples: 6272000 | consumed tokens: 12845056000 | elapsed time per iteration (s): 2.23 | learning rate: 1.110E-04 | global batch size: 512 | lm loss: 2.102000E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.302 | TFLOPs: 23.61 | 63: iteration 12260/ 24424 | consumed samples: 6277120 | consumed tokens: 12855541760 | elapsed time per iteration (s): 2.27 | learning rate: 1.109E-04 | global batch size: 512 | lm loss: 2.124466E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.420 | TFLOPs: 23.21 | 63: iteration 12270/ 24424 | consumed samples: 6282240 | consumed tokens: 12866027520 | elapsed time per iteration (s): 2.25 | learning rate: 1.108E-04 | global batch size: 512 | lm loss: 2.107049E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.641 | TFLOPs: 23.43 | 63: iteration 12280/ 24424 | consumed samples: 6287360 | consumed tokens: 12876513280 | elapsed time per iteration (s): 2.26 | learning rate: 1.106E-04 | global batch size: 512 | lm loss: 2.101435E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.992 | TFLOPs: 23.37 | 63: iteration 12290/ 24424 | consumed samples: 6292480 | consumed tokens: 12886999040 | elapsed time per iteration (s): 2.27 | learning rate: 1.105E-04 | global batch size: 512 | lm loss: 2.120553E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.855 | TFLOPs: 23.25 | 63: iteration 12300/ 24424 | consumed samples: 6297600 | consumed tokens: 12897484800 | elapsed time per iteration (s): 2.40 | learning rate: 1.104E-04 | global batch size: 512 | lm loss: 2.073809E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 213.388 | TFLOPs: 21.97 | 63: iteration 12310/ 24424 | consumed samples: 6302720 | consumed tokens: 12907970560 | elapsed time per iteration (s): 2.26 | learning rate: 1.103E-04 | global batch size: 512 | lm loss: 2.116129E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.973 | TFLOPs: 23.37 | 63: iteration 12320/ 24424 | consumed samples: 6307840 | consumed tokens: 12918456320 | elapsed time per iteration (s): 2.23 | learning rate: 1.102E-04 | global batch size: 512 | lm loss: 2.105922E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.788 | TFLOPs: 23.66 | 63: iteration 12330/ 24424 | consumed samples: 6312960 | consumed tokens: 12928942080 | elapsed time per iteration (s): 2.30 | learning rate: 1.101E-04 | global batch size: 512 | lm loss: 2.105472E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.319 | TFLOPs: 22.89 | 63: iteration 12340/ 24424 | consumed samples: 6318080 | consumed tokens: 12939427840 | elapsed time per iteration (s): 2.25 | learning rate: 1.099E-04 | global batch size: 512 | lm loss: 2.097705E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.607 | TFLOPs: 23.43 | 63: iteration 12350/ 24424 | consumed samples: 6323200 | consumed tokens: 12949913600 | elapsed time per iteration (s): 2.25 | learning rate: 1.098E-04 | global batch size: 512 | lm loss: 2.120322E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.758 | TFLOPs: 23.45 | 63: iteration 12360/ 24424 | consumed samples: 6328320 | consumed tokens: 12960399360 | elapsed time per iteration (s): 2.27 | learning rate: 1.097E-04 | global batch size: 512 | lm loss: 2.071042E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.944 | TFLOPs: 23.26 | 63: iteration 12370/ 24424 | consumed samples: 6333440 | consumed tokens: 12970885120 | elapsed time per iteration (s): 2.28 | learning rate: 1.096E-04 | global batch size: 512 | lm loss: 2.082745E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.003 | TFLOPs: 23.16 | 63: iteration 12380/ 24424 | consumed samples: 6338560 | consumed tokens: 12981370880 | elapsed time per iteration (s): 2.24 | learning rate: 1.095E-04 | global batch size: 512 | lm loss: 2.074538E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.313 | TFLOPs: 23.50 | 63: iteration 12390/ 24424 | consumed samples: 6343680 | consumed tokens: 12991856640 | elapsed time per iteration (s): 2.25 | learning rate: 1.094E-04 | global batch size: 512 | lm loss: 2.098056E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.016 | TFLOPs: 23.47 | 63: iteration 12400/ 24424 | consumed samples: 6348800 | consumed tokens: 13002342400 | elapsed time per iteration (s): 2.26 | learning rate: 1.092E-04 | global batch size: 512 | lm loss: 2.094773E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.999 | TFLOPs: 23.37 | 63: iteration 12410/ 24424 | consumed samples: 6353920 | consumed tokens: 13012828160 | elapsed time per iteration (s): 2.27 | learning rate: 1.091E-04 | global batch size: 512 | lm loss: 2.107874E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.992 | TFLOPs: 23.26 | 63: iteration 12420/ 24424 | consumed samples: 6359040 | consumed tokens: 13023313920 | elapsed time per iteration (s): 2.25 | learning rate: 1.090E-04 | global batch size: 512 | lm loss: 2.090290E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.171 | TFLOPs: 23.39 | 63: iteration 12430/ 24424 | consumed samples: 6364160 | consumed tokens: 13033799680 | elapsed time per iteration (s): 2.23 | learning rate: 1.089E-04 | global batch size: 512 | lm loss: 2.077513E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.625 | TFLOPs: 23.64 | 63: iteration 12440/ 24424 | consumed samples: 6369280 | consumed tokens: 13044285440 | elapsed time per iteration (s): 2.24 | learning rate: 1.088E-04 | global batch size: 512 | lm loss: 2.087029E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.595 | TFLOPs: 23.53 | 63: iteration 12450/ 24424 | consumed samples: 6374400 | consumed tokens: 13054771200 | elapsed time per iteration (s): 2.24 | learning rate: 1.086E-04 | global batch size: 512 | lm loss: 2.081956E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.324 | TFLOPs: 23.50 | 63: iteration 12460/ 24424 | consumed samples: 6379520 | consumed tokens: 13065256960 | elapsed time per iteration (s): 2.66 | learning rate: 1.085E-04 | global batch size: 512 | lm loss: 2.078871E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 192.133 | TFLOPs: 19.78 | 63: iteration 12470/ 24424 | consumed samples: 6384640 | consumed tokens: 13075742720 | elapsed time per iteration (s): 2.26 | learning rate: 1.084E-04 | global batch size: 512 | lm loss: 2.084746E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.832 | TFLOPs: 23.35 | 63: iteration 12480/ 24424 | consumed samples: 6389760 | consumed tokens: 13086228480 | elapsed time per iteration (s): 2.25 | learning rate: 1.083E-04 | global batch size: 512 | lm loss: 2.083971E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.557 | TFLOPs: 23.43 | 63: iteration 12490/ 24424 | consumed samples: 6394880 | consumed tokens: 13096714240 | elapsed time per iteration (s): 2.23 | learning rate: 1.082E-04 | global batch size: 512 | lm loss: 2.071241E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.392 | TFLOPs: 23.61 | 63: iteration 12500/ 24424 | consumed samples: 6400000 | consumed tokens: 13107200000 | elapsed time per iteration (s): 2.26 | learning rate: 1.081E-04 | global batch size: 512 | lm loss: 2.088543E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.197 | TFLOPs: 23.29 | 63: iteration 12510/ 24424 | consumed samples: 6405120 | consumed tokens: 13117685760 | elapsed time per iteration (s): 2.24 | learning rate: 1.079E-04 | global batch size: 512 | lm loss: 2.080306E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.065 | TFLOPs: 23.58 | 63: iteration 12520/ 24424 | consumed samples: 6410240 | consumed tokens: 13128171520 | elapsed time per iteration (s): 2.24 | learning rate: 1.078E-04 | global batch size: 512 | lm loss: 2.093711E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.543 | TFLOPs: 23.53 | 63: iteration 12530/ 24424 | consumed samples: 6415360 | consumed tokens: 13138657280 | elapsed time per iteration (s): 2.24 | learning rate: 1.077E-04 | global batch size: 512 | lm loss: 2.101290E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.848 | TFLOPs: 23.56 | 63: iteration 12540/ 24424 | consumed samples: 6420480 | consumed tokens: 13149143040 | elapsed time per iteration (s): 2.23 | learning rate: 1.076E-04 | global batch size: 512 | lm loss: 2.093369E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.891 | TFLOPs: 23.67 | 63: iteration 12550/ 24424 | consumed samples: 6425600 | consumed tokens: 13159628800 | elapsed time per iteration (s): 2.23 | learning rate: 1.075E-04 | global batch size: 512 | lm loss: 2.093408E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.394 | TFLOPs: 23.61 | 63: iteration 12560/ 24424 | consumed samples: 6430720 | consumed tokens: 13170114560 | elapsed time per iteration (s): 2.23 | learning rate: 1.074E-04 | global batch size: 512 | lm loss: 2.081352E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.560 | TFLOPs: 23.63 | 63: iteration 12570/ 24424 | consumed samples: 6435840 | consumed tokens: 13180600320 | elapsed time per iteration (s): 2.27 | learning rate: 1.072E-04 | global batch size: 512 | lm loss: 2.086634E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.863 | TFLOPs: 23.25 | 63: iteration 12580/ 24424 | consumed samples: 6440960 | consumed tokens: 13191086080 | elapsed time per iteration (s): 2.24 | learning rate: 1.071E-04 | global batch size: 512 | lm loss: 2.102804E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.596 | TFLOPs: 23.53 | 63: iteration 12590/ 24424 | consumed samples: 6446080 | consumed tokens: 13201571840 | elapsed time per iteration (s): 2.25 | learning rate: 1.070E-04 | global batch size: 512 | lm loss: 2.087858E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.561 | TFLOPs: 23.43 | 63: iteration 12600/ 24424 | consumed samples: 6451200 | consumed tokens: 13212057600 | elapsed time per iteration (s): 2.28 | learning rate: 1.069E-04 | global batch size: 512 | lm loss: 2.096683E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.875 | TFLOPs: 23.15 | 63: iteration 12610/ 24424 | consumed samples: 6456320 | consumed tokens: 13222543360 | elapsed time per iteration (s): 2.26 | learning rate: 1.068E-04 | global batch size: 512 | lm loss: 2.102318E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.959 | TFLOPs: 23.36 | 63: iteration 12620/ 24424 | consumed samples: 6461440 | consumed tokens: 13233029120 | elapsed time per iteration (s): 2.70 | learning rate: 1.067E-04 | global batch size: 512 | lm loss: 2.084249E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.406 | TFLOPs: 19.50 | 63: iteration 12630/ 24424 | consumed samples: 6466560 | consumed tokens: 13243514880 | elapsed time per iteration (s): 2.27 | learning rate: 1.065E-04 | global batch size: 512 | lm loss: 2.081598E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.335 | TFLOPs: 23.20 | 63: iteration 12640/ 24424 | consumed samples: 6471680 | consumed tokens: 13254000640 | elapsed time per iteration (s): 2.25 | learning rate: 1.064E-04 | global batch size: 512 | lm loss: 2.108115E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.484 | TFLOPs: 23.42 | 63: iteration 12650/ 24424 | consumed samples: 6476800 | consumed tokens: 13264486400 | elapsed time per iteration (s): 2.23 | learning rate: 1.063E-04 | global batch size: 512 | lm loss: 2.068846E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.690 | TFLOPs: 23.65 | 63: iteration 12660/ 24424 | consumed samples: 6481920 | consumed tokens: 13274972160 | elapsed time per iteration (s): 2.23 | learning rate: 1.062E-04 | global batch size: 512 | lm loss: 2.083052E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.906 | TFLOPs: 23.67 | 63: iteration 12670/ 24424 | consumed samples: 6487040 | consumed tokens: 13285457920 | elapsed time per iteration (s): 2.24 | learning rate: 1.061E-04 | global batch size: 512 | lm loss: 2.089622E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.135 | TFLOPs: 23.49 | 63: iteration 12680/ 24424 | consumed samples: 6492160 | consumed tokens: 13295943680 | elapsed time per iteration (s): 2.25 | learning rate: 1.060E-04 | global batch size: 512 | lm loss: 2.080707E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.599 | TFLOPs: 23.43 | 63: iteration 12690/ 24424 | consumed samples: 6497280 | consumed tokens: 13306429440 | elapsed time per iteration (s): 2.24 | learning rate: 1.058E-04 | global batch size: 512 | lm loss: 2.087053E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.131 | TFLOPs: 23.48 | 63: iteration 12700/ 24424 | consumed samples: 6502400 | consumed tokens: 13316915200 | elapsed time per iteration (s): 2.25 | learning rate: 1.057E-04 | global batch size: 512 | lm loss: 2.109446E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.559 | TFLOPs: 23.43 | 63: iteration 12710/ 24424 | consumed samples: 6507520 | consumed tokens: 13327400960 | elapsed time per iteration (s): 2.26 | learning rate: 1.056E-04 | global batch size: 512 | lm loss: 2.065633E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.323 | TFLOPs: 23.30 | 63: iteration 12720/ 24424 | consumed samples: 6512640 | consumed tokens: 13337886720 | elapsed time per iteration (s): 2.24 | learning rate: 1.055E-04 | global batch size: 512 | lm loss: 2.091605E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.199 | TFLOPs: 23.49 | 63: iteration 12730/ 24424 | consumed samples: 6517760 | consumed tokens: 13348372480 | elapsed time per iteration (s): 2.23 | learning rate: 1.054E-04 | global batch size: 512 | lm loss: 2.095542E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.287 | TFLOPs: 23.60 | 63: iteration 12740/ 24424 | consumed samples: 6522880 | consumed tokens: 13358858240 | elapsed time per iteration (s): 2.24 | learning rate: 1.053E-04 | global batch size: 512 | lm loss: 2.094297E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.284 | TFLOPs: 23.50 | 63: iteration 12750/ 24424 | consumed samples: 6528000 | consumed tokens: 13369344000 | elapsed time per iteration (s): 2.23 | learning rate: 1.051E-04 | global batch size: 512 | lm loss: 2.075430E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.705 | TFLOPs: 23.65 | 63: iteration 12760/ 24424 | consumed samples: 6533120 | consumed tokens: 13379829760 | elapsed time per iteration (s): 2.23 | learning rate: 1.050E-04 | global batch size: 512 | lm loss: 2.067123E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.609 | TFLOPs: 23.64 | 63: iteration 12770/ 24424 | consumed samples: 6538240 | consumed tokens: 13390315520 | elapsed time per iteration (s): 2.23 | learning rate: 1.049E-04 | global batch size: 512 | lm loss: 2.081796E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.137 | TFLOPs: 23.59 | 63: iteration 12780/ 24424 | consumed samples: 6543360 | consumed tokens: 13400801280 | elapsed time per iteration (s): 2.29 | learning rate: 1.048E-04 | global batch size: 512 | lm loss: 2.087612E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.339 | TFLOPs: 22.99 | 63: iteration 12790/ 24424 | consumed samples: 6548480 | consumed tokens: 13411287040 | elapsed time per iteration (s): 2.24 | learning rate: 1.047E-04 | global batch size: 512 | lm loss: 2.070629E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.786 | TFLOPs: 23.55 | 63: iteration 12800/ 24424 | consumed samples: 6553600 | consumed tokens: 13421772800 | elapsed time per iteration (s): 3.41 | learning rate: 1.046E-04 | global batch size: 512 | lm loss: 2.070890E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 150.244 | TFLOPs: 15.47 | 63: iteration 12810/ 24424 | consumed samples: 6558720 | consumed tokens: 13432258560 | elapsed time per iteration (s): 2.25 | learning rate: 1.044E-04 | global batch size: 512 | lm loss: 2.098206E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.868 | TFLOPs: 23.46 | 63: iteration 12820/ 24424 | consumed samples: 6563840 | consumed tokens: 13442744320 | elapsed time per iteration (s): 2.23 | learning rate: 1.043E-04 | global batch size: 512 | lm loss: 2.096419E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.534 | TFLOPs: 23.63 | 63: iteration 12830/ 24424 | consumed samples: 6568960 | consumed tokens: 13453230080 | elapsed time per iteration (s): 2.24 | learning rate: 1.042E-04 | global batch size: 512 | lm loss: 2.102942E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.885 | TFLOPs: 23.56 | 63: iteration 12840/ 24424 | consumed samples: 6574080 | consumed tokens: 13463715840 | elapsed time per iteration (s): 5.53 | learning rate: 1.041E-04 | global batch size: 512 | lm loss: 2.085549E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 92.588 | TFLOPs: 9.53 | 63: iteration 12850/ 24424 | consumed samples: 6579200 | consumed tokens: 13474201600 | elapsed time per iteration (s): 2.27 | learning rate: 1.040E-04 | global batch size: 512 | lm loss: 2.095443E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.381 | TFLOPs: 23.20 | 63: iteration 12860/ 24424 | consumed samples: 6584320 | consumed tokens: 13484687360 | elapsed time per iteration (s): 2.23 | learning rate: 1.039E-04 | global batch size: 512 | lm loss: 2.073565E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.843 | TFLOPs: 23.66 | 63: iteration 12870/ 24424 | consumed samples: 6589440 | consumed tokens: 13495173120 | elapsed time per iteration (s): 2.23 | learning rate: 1.037E-04 | global batch size: 512 | lm loss: 2.084946E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.707 | TFLOPs: 23.65 | 63: iteration 12880/ 24424 | consumed samples: 6594560 | consumed tokens: 13505658880 | elapsed time per iteration (s): 2.25 | learning rate: 1.036E-04 | global batch size: 512 | lm loss: 2.102766E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.451 | TFLOPs: 23.42 | 63: iteration 12890/ 24424 | consumed samples: 6599680 | consumed tokens: 13516144640 | elapsed time per iteration (s): 21.65 | learning rate: 1.035E-04 | global batch size: 512 | lm loss: 2.082286E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 23.651 | TFLOPs: 2.43 | 63: iteration 12900/ 24424 | consumed samples: 6604800 | consumed tokens: 13526630400 | elapsed time per iteration (s): 2.24 | learning rate: 1.034E-04 | global batch size: 512 | lm loss: 2.094649E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.753 | TFLOPs: 23.55 | 63: iteration 12910/ 24424 | consumed samples: 6609920 | consumed tokens: 13537116160 | elapsed time per iteration (s): 13.58 | learning rate: 1.033E-04 | global batch size: 512 | lm loss: 2.103516E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 37.712 | TFLOPs: 3.88 | 63: iteration 12920/ 24424 | consumed samples: 6615040 | consumed tokens: 13547601920 | elapsed time per iteration (s): 2.24 | learning rate: 1.032E-04 | global batch size: 512 | lm loss: 2.085799E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.431 | TFLOPs: 23.52 | 63: iteration 12930/ 24424 | consumed samples: 6620160 | consumed tokens: 13558087680 | elapsed time per iteration (s): 2.24 | learning rate: 1.030E-04 | global batch size: 512 | lm loss: 2.099624E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.405 | TFLOPs: 23.51 | 63: iteration 12940/ 24424 | consumed samples: 6625280 | consumed tokens: 13568573440 | elapsed time per iteration (s): 2.24 | learning rate: 1.029E-04 | global batch size: 512 | lm loss: 2.073578E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.667 | TFLOPs: 23.54 | 63: iteration 12950/ 24424 | consumed samples: 6630400 | consumed tokens: 13579059200 | elapsed time per iteration (s): 2.27 | learning rate: 1.028E-04 | global batch size: 512 | lm loss: 2.081878E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.584 | TFLOPs: 23.22 | 63: iteration 12960/ 24424 | consumed samples: 6635520 | consumed tokens: 13589544960 | elapsed time per iteration (s): 4.25 | learning rate: 1.027E-04 | global batch size: 512 | lm loss: 2.084485E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 120.436 | TFLOPs: 12.40 | 63: iteration 12970/ 24424 | consumed samples: 6640640 | consumed tokens: 13600030720 | elapsed time per iteration (s): 2.23 | learning rate: 1.026E-04 | global batch size: 512 | lm loss: 2.082680E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.529 | TFLOPs: 23.63 | 63: iteration 12980/ 24424 | consumed samples: 6645760 | consumed tokens: 13610516480 | elapsed time per iteration (s): 2.23 | learning rate: 1.025E-04 | global batch size: 512 | lm loss: 2.085960E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.888 | TFLOPs: 23.67 | 63: iteration 12990/ 24424 | consumed samples: 6650880 | consumed tokens: 13621002240 | elapsed time per iteration (s): 2.25 | learning rate: 1.023E-04 | global batch size: 512 | lm loss: 2.081138E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.775 | TFLOPs: 23.45 | 63: iteration 13000/ 24424 | consumed samples: 6656000 | consumed tokens: 13631488000 | elapsed time per iteration (s): 2.26 | learning rate: 1.022E-04 | global batch size: 512 | lm loss: 2.100616E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.891 | TFLOPs: 23.36 | 63: ------------------------------------------------------------------------------------------- 63: valid loss at iteration 13000 | lm loss value: 2.029228E+00 | lm loss PPL: 7.608212E+00 | 63: ------------------------------------------------------------------------------------------- 0: saving checkpoint at iteration 13000 to checkpoints_3b9 0: [2022-11-26 02:10:27,430] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step13000 is begin to save! 0: [2022-11-26 02:10:27,458] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_01-model_00-model_states.pt... 32: [2022-11-26 02:10:27,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_21-model_00-model_states.pt... 32: [2022-11-26 02:10:27,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_21-model_00-model_states.pt. 32: [2022-11-26 02:10:27,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_22-model_00-model_states.pt... 0: [2022-11-26 02:10:27,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_01-model_00-model_states.pt. 0: [2022-11-26 02:10:27,844] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_03-model_00-model_states.pt... 32: [2022-11-26 02:10:28,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_22-model_00-model_states.pt. 32: [2022-11-26 02:10:28,020] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_23-model_00-model_states.pt... 0: [2022-11-26 02:10:28,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_03-model_00-model_states.pt. 0: [2022-11-26 02:10:28,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_04-model_00-model_states.pt... 32: [2022-11-26 02:10:28,253] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_23-model_00-model_states.pt. 32: [2022-11-26 02:10:28,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_24-model_00-model_states.pt... 0: [2022-11-26 02:10:28,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_04-model_00-model_states.pt. 0: [2022-11-26 02:10:28,322] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_05-model_00-model_states.pt... 32: [2022-11-26 02:10:28,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_24-model_00-model_states.pt. 32: [2022-11-26 02:10:28,487] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_25-model_00-model_states.pt... 0: [2022-11-26 02:10:28,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_05-model_00-model_states.pt. 0: [2022-11-26 02:10:28,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_06-model_00-model_states.pt... 32: [2022-11-26 02:10:28,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_25-model_00-model_states.pt. 32: [2022-11-26 02:10:28,719] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_26-model_00-model_states.pt... 0: [2022-11-26 02:10:28,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_06-model_00-model_states.pt. 0: [2022-11-26 02:10:28,801] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_07-model_00-model_states.pt... 32: [2022-11-26 02:10:28,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_26-model_00-model_states.pt. 32: [2022-11-26 02:10:28,953] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_27-model_00-model_states.pt... 0: [2022-11-26 02:10:29,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_07-model_00-model_states.pt. 0: [2022-11-26 02:10:29,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_08-model_00-model_states.pt... 32: [2022-11-26 02:10:29,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_27-model_00-model_states.pt. 32: [2022-11-26 02:10:29,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_28-model_00-model_states.pt... 0: [2022-11-26 02:10:29,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_08-model_00-model_states.pt. 0: [2022-11-26 02:10:29,268] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_09-model_00-model_states.pt... 32: [2022-11-26 02:10:29,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_28-model_00-model_states.pt. 32: [2022-11-26 02:10:29,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_29-model_00-model_states.pt... 0: [2022-11-26 02:10:29,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_09-model_00-model_states.pt. 0: [2022-11-26 02:10:29,499] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_10-model_00-model_states.pt... 32: [2022-11-26 02:10:29,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_29-model_00-model_states.pt. 32: [2022-11-26 02:10:29,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_30-model_00-model_states.pt... 0: [2022-11-26 02:10:29,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_10-model_00-model_states.pt. 0: [2022-11-26 02:10:29,722] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_11-model_00-model_states.pt... 32: [2022-11-26 02:10:29,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_30-model_00-model_states.pt. 32: [2022-11-26 02:10:29,872] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_31-model_00-model_states.pt... 0: [2022-11-26 02:10:29,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_11-model_00-model_states.pt. 0: [2022-11-26 02:10:29,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_12-model_00-model_states.pt... 32: [2022-11-26 02:10:30,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_31-model_00-model_states.pt. 32: [2022-11-26 02:10:30,100] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_32-model_00-model_states.pt... 0: [2022-11-26 02:10:30,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_12-model_00-model_states.pt. 0: [2022-11-26 02:10:30,163] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_13-model_00-model_states.pt... 32: [2022-11-26 02:10:30,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_32-model_00-model_states.pt. 32: [2022-11-26 02:10:30,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_33-model_00-model_states.pt... 0: [2022-11-26 02:10:30,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_13-model_00-model_states.pt. 0: [2022-11-26 02:10:30,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_14-model_00-model_states.pt... 32: [2022-11-26 02:10:30,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_33-model_00-model_states.pt. 32: [2022-11-26 02:10:30,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_34-model_00-model_states.pt... 0: [2022-11-26 02:10:30,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_14-model_00-model_states.pt. 0: [2022-11-26 02:10:30,605] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_15-model_00-model_states.pt... 32: [2022-11-26 02:10:30,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_34-model_00-model_states.pt. 32: [2022-11-26 02:10:30,777] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_35-model_00-model_states.pt... 0: [2022-11-26 02:10:30,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_15-model_00-model_states.pt. 0: [2022-11-26 02:10:30,821] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_16-model_00-model_states.pt... 32: [2022-11-26 02:10:31,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_35-model_00-model_states.pt. 32: [2022-11-26 02:10:31,001] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_36-model_00-model_states.pt... 0: [2022-11-26 02:10:31,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_16-model_00-model_states.pt. 0: [2022-11-26 02:10:31,041] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_17-model_00-model_states.pt... 32: [2022-11-26 02:10:31,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_36-model_00-model_states.pt. 32: [2022-11-26 02:10:31,225] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_37-model_00-model_states.pt... 0: [2022-11-26 02:10:31,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_17-model_00-model_states.pt. 0: [2022-11-26 02:10:31,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_18-model_00-model_states.pt... 32: [2022-11-26 02:10:31,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_37-model_00-model_states.pt. 32: [2022-11-26 02:10:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_38-model_00-model_states.pt... 0: [2022-11-26 02:10:31,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_18-model_00-model_states.pt. 0: [2022-11-26 02:10:31,483] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_19-model_00-model_states.pt... 32: [2022-11-26 02:10:31,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_38-model_00-model_states.pt. 32: [2022-11-26 02:10:31,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_40-model_00-model_states.pt... 32: [2022-11-26 02:10:31,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_40-model_00-model_states.pt. 32: [2022-11-26 02:10:31,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/mp_rank_01_model_states.pt... 32: [2022-11-26 02:10:31,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/mp_rank_01_model_states.pt. 0: [2022-11-26 02:10:31,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_19-model_00-model_states.pt. 0: [2022-11-26 02:10:31,699] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/layer_20-model_00-model_states.pt... 0: [2022-11-26 02:10:31,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/layer_20-model_00-model_states.pt. 0: [2022-11-26 02:10:31,919] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step13000/mp_rank_00_model_states.pt 0: [2022-11-26 02:10:31,919] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/mp_rank_00_model_states.pt... 0: [2022-11-26 02:10:31,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/mp_rank_00_model_states.pt. 60: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 60: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 60: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 32: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 50: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 51: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 55: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 53: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 57: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 57: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 61: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 59: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 59: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 59: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 63: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 63: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 52: [2022-11-26 02:10:32,077] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-26 02:10:32,077] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-26 02:10:32,077] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-26 02:10:32,077] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-26 02:10:32,077] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 52: [2022-11-26 02:10:32,077] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 54: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 54: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 54: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 56: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 56: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 58: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 58: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 62: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 60: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 33: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 33: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 35: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 34: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 34: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 40: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 40: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 40: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 40: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 44: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 44: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 44: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 32: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 42: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 42: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 42: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 46: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 46: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 46: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 46: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 36: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 48: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 48: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 41: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 37: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 37: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 37: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 37: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 47: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 47: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 47: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 47: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 49: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 49: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 43: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 43: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 43: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 45: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 45: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 45: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 39: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 39: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 50: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 38: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 38: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 38: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 51: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 55: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 55: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 55: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 53: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 53: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 57: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 57: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 61: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 61: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 59: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 63: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 63: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 19: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 19: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 21: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 21: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 21: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 21: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 15: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 15: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 15: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 15: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 52: [2022-11-26 02:10:32,077] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-26 02:10:32,077] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 54: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 56: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 58: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 62: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 62: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 16: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 16: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 16: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 60: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 2: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 2: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 2: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 4: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 4: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 4: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 4: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 4: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 4: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 14: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 14: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 14: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 18: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 18: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 18: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 18: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 18: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 18: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 10: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 10: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 8: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 8: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 8: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 0: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 11: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 11: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 11: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 17: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 17: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 9: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 13: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 13: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 23: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 23: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 25: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 25: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 27: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 27: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 27: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 27: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 31: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 31: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 31: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 31: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 5: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 5: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 5: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 5: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 5: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 33: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 33: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 1: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 1: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 1: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 1: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 7: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 7: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 7: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 3: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 35: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 35: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 35: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 22: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 22: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 12: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 12: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 30: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 30: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 30: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 30: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 24: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 24: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 24: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 24: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 28: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 28: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 28: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 20: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 34: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 40: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 44: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 32: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 42: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 46: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 46: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 36: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 36: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 48: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 41: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 41: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 37: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 47: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 49: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 43: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 45: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 39: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 50: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 38: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 51: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 19: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 19: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 19: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 21: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 21: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 21: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 15: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 54: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 62: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 62: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 16: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 2: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 2: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 2: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 4: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 4: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 14: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 18: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 10: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 10: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 10: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 8: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 8: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 0: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 0: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 0: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 26: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 26: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 11: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 11: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 17: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 9: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 9: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 13: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 13: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 23: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 25: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 27: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 27: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 29: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 29: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 29: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 29: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 31: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 31: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 5: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 1: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 1: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 7: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 7: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 7: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 3: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 3: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 3: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 3: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 35: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 22: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 22: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 22: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 22: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 12: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 30: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 30: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 24: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 28: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 28: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 20: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 20: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 34: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 40: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 44: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 6: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 6: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 6: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 32: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 42: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 37: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 47: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 49: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 43: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 45: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 39: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 38: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 19: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 19: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 21: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 15: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 16: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 16: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 2: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 14: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 18: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 10: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 10: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 8: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 8: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 0: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 26: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 26: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 11: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 17: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 9: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 9: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 23: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 25: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 25: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 27: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 29: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 29: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 31: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 31: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 1: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 7: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 3: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 3: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 22: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 12: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 30: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 24: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 24: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 28: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 20: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 20: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 20: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 20: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 34: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 6: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 32: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 32: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 32: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 32: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 43: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 19: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 15: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 16: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 2: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 14: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 10: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 0: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 11: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 17: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 17: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 9: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 9: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 13: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 23: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 25: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 27: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 29: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 29: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 1: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 7: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 3: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 22: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 12: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 12: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 30: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 24: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 28: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 34: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 6: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 16: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 14: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 0: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 26: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 26: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 17: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 9: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 13: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 23: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 23: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 23: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 25: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 12: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 12: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 34: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 6: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 14: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 0: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 13: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 25: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 6: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 6: [2022-11-26 02:10:32,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step13000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 19: [2022-11-26 02:10:32,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:10:32,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-26 02:10:32,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 58: [2022-11-26 02:10:32,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 13: [2022-11-26 02:10:32,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 39: [2022-11-26 02:10:32,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 58: [2022-11-26 02:10:32,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 13: [2022-11-26 02:10:32,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 39: [2022-11-26 02:10:32,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 58: [2022-11-26 02:10:32,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 13: [2022-11-26 02:10:32,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 39: [2022-11-26 02:10:32,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 54: [2022-11-26 02:10:32,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 47: [2022-11-26 02:10:32,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-26 02:10:32,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 54: [2022-11-26 02:10:32,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 47: [2022-11-26 02:10:32,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 54: [2022-11-26 02:10:32,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 31: [2022-11-26 02:10:32,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:10:32,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-26 02:10:32,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 21: [2022-11-26 02:10:32,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 40: [2022-11-26 02:10:32,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 21: [2022-11-26 02:10:32,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 40: [2022-11-26 02:10:32,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 21: [2022-11-26 02:10:32,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 40: [2022-11-26 02:10:32,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 53: [2022-11-26 02:10:32,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-26 02:10:32,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-26 02:10:32,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 61: [2022-11-26 02:10:32,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 61: [2022-11-26 02:10:32,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-26 02:10:32,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 36: [2022-11-26 02:10:32,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 33: [2022-11-26 02:10:32,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-26 02:10:32,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-26 02:10:32,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 8: [2022-11-26 02:10:32,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-26 02:10:32,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-26 02:10:32,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 52: [2022-11-26 02:10:32,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 52: [2022-11-26 02:10:32,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-26 02:10:32,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-26 02:10:32,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 52: [2022-11-26 02:10:32,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-26 02:10:32,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 21: [2022-11-26 02:10:32,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-26 02:10:32,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-26 02:10:32,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 37: [2022-11-26 02:10:32,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 37: [2022-11-26 02:10:32,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-26 02:10:32,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 27: [2022-11-26 02:10:32,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-26 02:10:32,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-26 02:10:32,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 32: [2022-11-26 02:10:32,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-26 02:10:32,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-26 02:10:32,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 37: [2022-11-26 02:10:32,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-26 02:10:32,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 2: [2022-11-26 02:10:32,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 37: [2022-11-26 02:10:32,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 2: [2022-11-26 02:10:32,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-26 02:10:32,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 2: [2022-11-26 02:10:32,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 2: [2022-11-26 02:10:32,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-26 02:10:32,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 44: [2022-11-26 02:10:32,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-26 02:10:32,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-26 02:10:32,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 59: [2022-11-26 02:10:32,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-26 02:10:32,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-26 02:10:32,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 45: [2022-11-26 02:10:32,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 26: [2022-11-26 02:10:32,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 45: [2022-11-26 02:10:32,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 51: [2022-11-26 02:10:32,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 26: [2022-11-26 02:10:32,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 45: [2022-11-26 02:10:32,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 51: [2022-11-26 02:10:32,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 26: [2022-11-26 02:10:32,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 32: [2022-11-26 02:10:32,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 51: [2022-11-26 02:10:32,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 32: [2022-11-26 02:10:32,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-26 02:10:32,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 56: [2022-11-26 02:10:32,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 56: [2022-11-26 02:10:32,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 45: [2022-11-26 02:10:32,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 56: [2022-11-26 02:10:32,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 45: [2022-11-26 02:10:32,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-26 02:10:32,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 31: [2022-11-26 02:10:32,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:10:32,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-26 02:10:32,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 47: [2022-11-26 02:10:32,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-26 02:10:32,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-26 02:10:32,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 57: [2022-11-26 02:10:32,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 57: [2022-11-26 02:10:32,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-26 02:10:32,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 9: [2022-11-26 02:10:32,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-26 02:10:32,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-26 02:10:32,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 8: [2022-11-26 02:10:32,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 8: [2022-11-26 02:10:32,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-26 02:10:32,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 48: [2022-11-26 02:10:32,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 48: [2022-11-26 02:10:32,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-26 02:10:32,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 13: [2022-11-26 02:10:32,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-26 02:10:32,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 23: [2022-11-26 02:10:32,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 13: [2022-11-26 02:10:32,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 23: [2022-11-26 02:10:32,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-26 02:10:32,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 35: [2022-11-26 02:10:32,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-26 02:10:32,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-26 02:10:32,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 53: [2022-11-26 02:10:32,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-26 02:10:32,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-26 02:10:32,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 19: [2022-11-26 02:10:32,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:10:32,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-26 02:10:32,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 14: [2022-11-26 02:10:32,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 40: [2022-11-26 02:10:32,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-26 02:10:32,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-26 02:10:32,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 14: [2022-11-26 02:10:32,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-26 02:10:32,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 5: [2022-11-26 02:10:32,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-26 02:10:32,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-26 02:10:32,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 43: [2022-11-26 02:10:32,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 5: [2022-11-26 02:10:32,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 43: [2022-11-26 02:10:32,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 5: [2022-11-26 02:10:32,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 43: [2022-11-26 02:10:32,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 5: [2022-11-26 02:10:32,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 59: [2022-11-26 02:10:32,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-26 02:10:32,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-26 02:10:32,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 20: [2022-11-26 02:10:32,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-26 02:10:32,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-26 02:10:32,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 26: [2022-11-26 02:10:32,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 23: [2022-11-26 02:10:32,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 26: [2022-11-26 02:10:32,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-26 02:10:32,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 23: [2022-11-26 02:10:32,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 57: [2022-11-26 02:10:32,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 23: [2022-11-26 02:10:32,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 57: [2022-11-26 02:10:32,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 49: [2022-11-26 02:10:32,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 57: [2022-11-26 02:10:32,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 49: [2022-11-26 02:10:32,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 61: [2022-11-26 02:10:32,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 49: [2022-11-26 02:10:32,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 61: [2022-11-26 02:10:32,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 36: [2022-11-26 02:10:32,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 61: [2022-11-26 02:10:32,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 36: [2022-11-26 02:10:32,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 34: [2022-11-26 02:10:32,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 34: [2022-11-26 02:10:32,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 36: [2022-11-26 02:10:32,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 34: [2022-11-26 02:10:32,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 36: [2022-11-26 02:10:32,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-26 02:10:32,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 32: [2022-11-26 02:10:32,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-26 02:10:32,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-26 02:10:32,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 35: [2022-11-26 02:10:32,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 35: [2022-11-26 02:10:32,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-26 02:10:32,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 34: [2022-11-26 02:10:32,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 34: [2022-11-26 02:10:32,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-26 02:10:32,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 56: [2022-11-26 02:10:32,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 56: [2022-11-26 02:10:32,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-26 02:10:32,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 44: [2022-11-26 02:10:32,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 44: [2022-11-26 02:10:32,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 44: [2022-11-26 02:10:32,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 14: [2022-11-26 02:10:32,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-26 02:10:32,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-26 02:10:32,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 19: [2022-11-26 02:10:32,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:10:32,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-26 02:10:32,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 43: [2022-11-26 02:10:32,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-26 02:10:32,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-26 02:10:32,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-26 02:10:32,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-26 02:10:32,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 43: [2022-11-26 02:10:32,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 61: [2022-11-26 02:10:32,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-26 02:10:32,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-26 02:10:32,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 9: [2022-11-26 02:10:32,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 1: [2022-11-26 02:10:32,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-26 02:10:32,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 1: [2022-11-26 02:10:32,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 1: [2022-11-26 02:10:32,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-26 02:10:32,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-26 02:10:32,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 9: [2022-11-26 02:10:32,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-26 02:10:32,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 2: [2022-11-26 02:10:32,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 26: [2022-11-26 02:10:32,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 9: [2022-11-26 02:10:32,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 2: [2022-11-26 02:10:32,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 26: [2022-11-26 02:10:32,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 9: [2022-11-26 02:10:32,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 2: [2022-11-26 02:10:32,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 26: [2022-11-26 02:10:32,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 9: [2022-11-26 02:10:32,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 31: [2022-11-26 02:10:32,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:10:32,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-26 02:10:32,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 37: [2022-11-26 02:10:32,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-26 02:10:32,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-26 02:10:32,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 16: [2022-11-26 02:10:32,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-26 02:10:32,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-26 02:10:32,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 16: [2022-11-26 02:10:32,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-26 02:10:32,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-26 02:10:32,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-26 02:10:32,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 16: [2022-11-26 02:10:32,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 16: [2022-11-26 02:10:32,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 49: [2022-11-26 02:10:32,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-26 02:10:32,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 49: [2022-11-26 02:10:32,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-26 02:10:32,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-26 02:10:32,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 13: [2022-11-26 02:10:32,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 49: [2022-11-26 02:10:32,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 13: [2022-11-26 02:10:32,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-26 02:10:32,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 20: [2022-11-26 02:10:32,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-26 02:10:32,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-26 02:10:32,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 47: [2022-11-26 02:10:32,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-26 02:10:32,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 0: [2022-11-26 02:10:32,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 47: [2022-11-26 02:10:32,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 0: [2022-11-26 02:10:32,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 0: [2022-11-26 02:10:32,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 48: [2022-11-26 02:10:32,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 59: [2022-11-26 02:10:32,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-26 02:10:32,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 48: [2022-11-26 02:10:32,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 59: [2022-11-26 02:10:32,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 48: [2022-11-26 02:10:32,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 45: [2022-11-26 02:10:32,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 52: [2022-11-26 02:10:32,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 5: [2022-11-26 02:10:32,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 52: [2022-11-26 02:10:32,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 5: [2022-11-26 02:10:32,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 45: [2022-11-26 02:10:32,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 52: [2022-11-26 02:10:32,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 5: [2022-11-26 02:10:32,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 45: [2022-11-26 02:10:32,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 37: [2022-11-26 02:10:32,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 37: [2022-11-26 02:10:32,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-26 02:10:32,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 40: [2022-11-26 02:10:32,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-26 02:10:32,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-26 02:10:32,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 24: [2022-11-26 02:10:32,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-26 02:10:32,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-26 02:10:32,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 35: [2022-11-26 02:10:32,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-26 02:10:32,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-26 02:10:32,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 24: [2022-11-26 02:10:32,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-26 02:10:32,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-26 02:10:32,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 24: [2022-11-26 02:10:32,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-26 02:10:32,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-26 02:10:32,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 21: [2022-11-26 02:10:32,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 21: [2022-11-26 02:10:32,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-26 02:10:32,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 27: [2022-11-26 02:10:32,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-26 02:10:32,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-26 02:10:32,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 36: [2022-11-26 02:10:32,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-26 02:10:32,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-26 02:10:32,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 12: [2022-11-26 02:10:32,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-26 02:10:32,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 56: [2022-11-26 02:10:32,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 12: [2022-11-26 02:10:32,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 56: [2022-11-26 02:10:32,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-26 02:10:32,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 60: [2022-11-26 02:10:32,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-26 02:10:32,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 60: [2022-11-26 02:10:32,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 60: [2022-11-26 02:10:32,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-26 02:10:32,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-26 02:10:32,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 60: [2022-11-26 02:10:32,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 60: [2022-11-26 02:10:32,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 60: [2022-11-26 02:10:32,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 6: [2022-11-26 02:10:32,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 6: [2022-11-26 02:10:32,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 6: [2022-11-26 02:10:32,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-26 02:10:32,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 6: [2022-11-26 02:10:32,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-26 02:10:32,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 6: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 6: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 10: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 17: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 17: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 10: [2022-11-26 02:10:32,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 17: [2022-11-26 02:10:32,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-26 02:10:32,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-26 02:10:32,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 10: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 17: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 17: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 17: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 7: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 7: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 14: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 7: [2022-11-26 02:10:32,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 7: [2022-11-26 02:10:32,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 14: [2022-11-26 02:10:32,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 7: [2022-11-26 02:10:32,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 57: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 7: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 14: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 7: [2022-11-26 02:10:32,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 57: [2022-11-26 02:10:32,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 58: [2022-11-26 02:10:32,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-26 02:10:32,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-26 02:10:32,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 34: [2022-11-26 02:10:32,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-26 02:10:32,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-26 02:10:32,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 11: [2022-11-26 02:10:32,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 47: [2022-11-26 02:10:32,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 8: [2022-11-26 02:10:32,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 11: [2022-11-26 02:10:32,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 23: [2022-11-26 02:10:32,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 47: [2022-11-26 02:10:32,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 8: [2022-11-26 02:10:32,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 11: [2022-11-26 02:10:32,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 23: [2022-11-26 02:10:32,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 47: [2022-11-26 02:10:32,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 8: [2022-11-26 02:10:32,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 23: [2022-11-26 02:10:32,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 42: [2022-11-26 02:10:32,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 42: [2022-11-26 02:10:32,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-26 02:10:32,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 39: [2022-11-26 02:10:32,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-26 02:10:32,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-26 02:10:32,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 20: [2022-11-26 02:10:32,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-26 02:10:32,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 3: [2022-11-26 02:10:32,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 3: [2022-11-26 02:10:32,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 3: [2022-11-26 02:10:32,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 20: [2022-11-26 02:10:32,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 3: [2022-11-26 02:10:32,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-26 02:10:32,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-26 02:10:32,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 3: [2022-11-26 02:10:32,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 3: [2022-11-26 02:10:32,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 3: [2022-11-26 02:10:32,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 33: [2022-11-26 02:10:32,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-26 02:10:32,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-26 02:10:32,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 51: [2022-11-26 02:10:32,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 51: [2022-11-26 02:10:32,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-26 02:10:32,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 10: [2022-11-26 02:10:32,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-26 02:10:32,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-26 02:10:32,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 27: [2022-11-26 02:10:32,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-26 02:10:32,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-26 02:10:32,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 12: [2022-11-26 02:10:32,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-26 02:10:32,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-26 02:10:32,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 38: [2022-11-26 02:10:32,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-26 02:10:32,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 38: [2022-11-26 02:10:32,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-26 02:10:32,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-26 02:10:32,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-26 02:10:32,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 38: [2022-11-26 02:10:32,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-26 02:10:32,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 38: [2022-11-26 02:10:32,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 10: [2022-11-26 02:10:32,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 10: [2022-11-26 02:10:32,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-26 02:10:32,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 1: [2022-11-26 02:10:32,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-26 02:10:32,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-26 02:10:32,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 4: [2022-11-26 02:10:32,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 2: [2022-11-26 02:10:32,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-26 02:10:32,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-26 02:10:32,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 18: [2022-11-26 02:10:32,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-26 02:10:32,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-26 02:10:32,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 6: [2022-11-26 02:10:32,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 18: [2022-11-26 02:10:32,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 6: [2022-11-26 02:10:32,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 18: [2022-11-26 02:10:32,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 6: [2022-11-26 02:10:32,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 18: [2022-11-26 02:10:32,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 18: [2022-11-26 02:10:32,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-26 02:10:32,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-26 02:10:32,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 44: [2022-11-26 02:10:32,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-26 02:10:32,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-26 02:10:32,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 13: [2022-11-26 02:10:32,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-26 02:10:32,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-26 02:10:32,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 27: [2022-11-26 02:10:32,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 27: [2022-11-26 02:10:32,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-26 02:10:32,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 43: [2022-11-26 02:10:32,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 54: [2022-11-26 02:10:32,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 43: [2022-11-26 02:10:32,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-26 02:10:32,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 54: [2022-11-26 02:10:32,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-26 02:10:32,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 41: [2022-11-26 02:10:32,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 41: [2022-11-26 02:10:32,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-26 02:10:32,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 41: [2022-11-26 02:10:32,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-26 02:10:32,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 41: [2022-11-26 02:10:32,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 41: [2022-11-26 02:10:32,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 41: [2022-11-26 02:10:32,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 41: [2022-11-26 02:10:32,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 63: [2022-11-26 02:10:32,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-26 02:10:32,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-26 02:10:32,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 3: [2022-11-26 02:10:32,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 63: [2022-11-26 02:10:32,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-26 02:10:32,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-26 02:10:32,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-26 02:10:32,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 63: [2022-11-26 02:10:32,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 63: [2022-11-26 02:10:32,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 3: [2022-11-26 02:10:32,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 39: [2022-11-26 02:10:32,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 3: [2022-11-26 02:10:32,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 39: [2022-11-26 02:10:32,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-26 02:10:32,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 0: [2022-11-26 02:10:32,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-26 02:10:32,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-26 02:10:32,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 0: [2022-11-26 02:10:32,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 0: [2022-11-26 02:10:32,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-26 02:10:32,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-26 02:10:32,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 26: [2022-11-26 02:10:32,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-26 02:10:32,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-26 02:10:32,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 46: [2022-11-26 02:10:32,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:10:32,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:10:32,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-26 02:10:32,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-26 02:10:32,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 46: [2022-11-26 02:10:32,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 52: [2022-11-26 02:10:32,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 33: [2022-11-26 02:10:32,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 52: [2022-11-26 02:10:32,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-26 02:10:32,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 33: [2022-11-26 02:10:32,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-26 02:10:32,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 45: [2022-11-26 02:10:32,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 22: [2022-11-26 02:10:32,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 45: [2022-11-26 02:10:32,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 9: [2022-11-26 02:10:32,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 22: [2022-11-26 02:10:32,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 45: [2022-11-26 02:10:32,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 9: [2022-11-26 02:10:32,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 22: [2022-11-26 02:10:32,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-26 02:10:32,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 9: [2022-11-26 02:10:32,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 22: [2022-11-26 02:10:32,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-26 02:10:32,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 14: [2022-11-26 02:10:32,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 14: [2022-11-26 02:10:32,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-26 02:10:32,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 12: [2022-11-26 02:10:32,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 12: [2022-11-26 02:10:32,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-26 02:10:32,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 19: [2022-11-26 02:10:32,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:10:32,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-26 02:10:32,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 59: [2022-11-26 02:10:32,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 59: [2022-11-26 02:10:32,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-26 02:10:32,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 62: [2022-11-26 02:10:32,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-26 02:10:32,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-26 02:10:32,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-26 02:10:32,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-26 02:10:32,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-26 02:10:32,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-26 02:10:32,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 62: [2022-11-26 02:10:32,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 62: [2022-11-26 02:10:32,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 15: [2022-11-26 02:10:32,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-26 02:10:32,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-26 02:10:32,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 4: [2022-11-26 02:10:32,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-26 02:10:32,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 4: [2022-11-26 02:10:32,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-26 02:10:32,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 4: [2022-11-26 02:10:32,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-26 02:10:32,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-26 02:10:32,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 4: [2022-11-26 02:10:32,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 55: [2022-11-26 02:10:32,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 55: [2022-11-26 02:10:32,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-26 02:10:32,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-26 02:10:32,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 55: [2022-11-26 02:10:32,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-26 02:10:32,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-26 02:10:32,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 55: [2022-11-26 02:10:32,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-26 02:10:32,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 55: [2022-11-26 02:10:32,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 55: [2022-11-26 02:10:32,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 55: [2022-11-26 02:10:32,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 28: [2022-11-26 02:10:32,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 28: [2022-11-26 02:10:32,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-26 02:10:32,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-26 02:10:32,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-26 02:10:32,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-26 02:10:32,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 28: [2022-11-26 02:10:32,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 28: [2022-11-26 02:10:32,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-26 02:10:32,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 21: [2022-11-26 02:10:32,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-26 02:10:32,240] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-26 02:10:32,240] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 54: [2022-11-26 02:10:32,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 54: [2022-11-26 02:10:32,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-26 02:10:32,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 29: [2022-11-26 02:10:32,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-26 02:10:32,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 29: [2022-11-26 02:10:32,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 50: [2022-11-26 02:10:32,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-26 02:10:32,243] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-26 02:10:32,243] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 29: [2022-11-26 02:10:32,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-26 02:10:32,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-26 02:10:32,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-26 02:10:32,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 29: [2022-11-26 02:10:32,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-26 02:10:32,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 30: [2022-11-26 02:10:32,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-26 02:10:32,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-26 02:10:32,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-26 02:10:32,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 15: [2022-11-26 02:10:32,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-26 02:10:32,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 30: [2022-11-26 02:10:32,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-26 02:10:32,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 15: [2022-11-26 02:10:32,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-26 02:10:32,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 30: [2022-11-26 02:10:32,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 15: [2022-11-26 02:10:32,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 15: [2022-11-26 02:10:32,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 30: [2022-11-26 02:10:32,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 30: [2022-11-26 02:10:32,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 30: [2022-11-26 02:10:32,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-26 02:10:32,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 30: [2022-11-26 02:10:32,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 25: [2022-11-26 02:10:32,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 25: [2022-11-26 02:10:32,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 25: [2022-11-26 02:10:32,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-26 02:10:32,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-26 02:10:32,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 25: [2022-11-26 02:10:32,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 58: [2022-11-26 02:10:32,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-26 02:10:32,249] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 50: [2022-11-26 02:10:32,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 58: [2022-11-26 02:10:32,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 50: [2022-11-26 02:10:32,249] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-26 02:10:32,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 12: [2022-11-26 02:10:32,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-26 02:10:32,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-26 02:10:32,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 10: [2022-11-26 02:10:32,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-26 02:10:32,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-26 02:10:32,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 24: [2022-11-26 02:10:32,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-26 02:10:32,257] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-26 02:10:32,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 34: [2022-11-26 02:10:32,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 53: [2022-11-26 02:10:32,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 17: [2022-11-26 02:10:32,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 34: [2022-11-26 02:10:32,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 53: [2022-11-26 02:10:32,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 17: [2022-11-26 02:10:32,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 34: [2022-11-26 02:10:32,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 53: [2022-11-26 02:10:32,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 17: [2022-11-26 02:10:32,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 25: [2022-11-26 02:10:32,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-26 02:10:32,262] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-26 02:10:32,262] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 50: [2022-11-26 02:10:32,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-26 02:10:32,262] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-26 02:10:32,262] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 22: [2022-11-26 02:10:32,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-26 02:10:32,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-26 02:10:32,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 23: [2022-11-26 02:10:32,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-26 02:10:32,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-26 02:10:32,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 42: [2022-11-26 02:10:32,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 42: [2022-11-26 02:10:32,270] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 42: [2022-11-26 02:10:32,270] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 0: [2022-11-26 02:10:32,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-26 02:10:32,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 42: [2022-11-26 02:10:32,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 42: [2022-11-26 02:10:32,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-26 02:10:32,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 58: [2022-11-26 02:10:32,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-26 02:10:32,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-26 02:10:32,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 32: [2022-11-26 02:10:32,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-26 02:10:32,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-26 02:10:32,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 39: [2022-11-26 02:10:32,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-26 02:10:32,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-26 02:10:32,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 53: [2022-11-26 02:10:32,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-26 02:10:32,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-26 02:10:32,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 5: [2022-11-26 02:10:32,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-26 02:10:32,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-26 02:10:32,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 11: [2022-11-26 02:10:32,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-26 02:10:32,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-26 02:10:32,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 11: [2022-11-26 02:10:32,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-26 02:10:32,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-26 02:10:32,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 31: [2022-11-26 02:10:32,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:10:32,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-26 02:10:32,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 51: [2022-11-26 02:10:32,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-26 02:10:32,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-26 02:10:32,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 49: [2022-11-26 02:10:32,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-26 02:10:32,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-26 02:10:32,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 51: [2022-11-26 02:10:32,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-26 02:10:32,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-26 02:10:32,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 61: [2022-11-26 02:10:32,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-26 02:10:32,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-26 02:10:32,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 8: [2022-11-26 02:10:32,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-26 02:10:32,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 8: [2022-11-26 02:10:32,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 46: [2022-11-26 02:10:32,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:10:32,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-26 02:10:32,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 7: [2022-11-26 02:10:32,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-26 02:10:32,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-26 02:10:32,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 35: [2022-11-26 02:10:32,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 35: [2022-11-26 02:10:32,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-26 02:10:32,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 40: [2022-11-26 02:10:32,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 16: [2022-11-26 02:10:32,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 40: [2022-11-26 02:10:32,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 16: [2022-11-26 02:10:32,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 40: [2022-11-26 02:10:32,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 16: [2022-11-26 02:10:32,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 57: [2022-11-26 02:10:32,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 57: [2022-11-26 02:10:32,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 57: [2022-11-26 02:10:32,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 48: [2022-11-26 02:10:32,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-26 02:10:32,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-26 02:10:32,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 54: [2022-11-26 02:10:32,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 60: [2022-11-26 02:10:32,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-26 02:10:32,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-26 02:10:32,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 36: [2022-11-26 02:10:32,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-26 02:10:32,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 36: [2022-11-26 02:10:32,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 54: [2022-11-26 02:10:32,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-26 02:10:32,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 33: [2022-11-26 02:10:32,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 33: [2022-11-26 02:10:32,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 33: [2022-11-26 02:10:32,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 63: [2022-11-26 02:10:32,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-26 02:10:32,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-26 02:10:32,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 56: [2022-11-26 02:10:32,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 56: [2022-11-26 02:10:32,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 56: [2022-11-26 02:10:32,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 38: [2022-11-26 02:10:32,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 38: [2022-11-26 02:10:32,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-26 02:10:32,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 44: [2022-11-26 02:10:32,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-26 02:10:32,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-26 02:10:32,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 37: [2022-11-26 02:10:32,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-26 02:10:32,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-26 02:10:32,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 27: [2022-11-26 02:10:32,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-26 02:10:32,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-26 02:10:32,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 62: [2022-11-26 02:10:32,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-26 02:10:32,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-26 02:10:32,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 42: [2022-11-26 02:10:32,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 42: [2022-11-26 02:10:32,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-26 02:10:32,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 19: [2022-11-26 02:10:32,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:10:32,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 6: [2022-11-26 02:10:32,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:10:32,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 6: [2022-11-26 02:10:32,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-26 02:10:32,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 47: [2022-11-26 02:10:32,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-26 02:10:32,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-26 02:10:32,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 28: [2022-11-26 02:10:32,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 28: [2022-11-26 02:10:32,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-26 02:10:32,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 30: [2022-11-26 02:10:32,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 30: [2022-11-26 02:10:32,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-26 02:10:32,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 18: [2022-11-26 02:10:32,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 18: [2022-11-26 02:10:32,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-26 02:10:32,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 20: [2022-11-26 02:10:32,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-26 02:10:32,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-26 02:10:32,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 52: [2022-11-26 02:10:32,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-26 02:10:32,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-26 02:10:32,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 13: [2022-11-26 02:10:32,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 9: [2022-11-26 02:10:32,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 13: [2022-11-26 02:10:32,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 9: [2022-11-26 02:10:32,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 13: [2022-11-26 02:10:32,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 9: [2022-11-26 02:10:32,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 1: [2022-11-26 02:10:32,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-26 02:10:32,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-26 02:10:32,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 11: [2022-11-26 02:10:32,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 11: [2022-11-26 02:10:32,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 11: [2022-11-26 02:10:32,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 25: [2022-11-26 02:10:32,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 25: [2022-11-26 02:10:32,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-26 02:10:32,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 29: [2022-11-26 02:10:32,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-26 02:10:32,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-26 02:10:32,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 26: [2022-11-26 02:10:32,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 26: [2022-11-26 02:10:32,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-26 02:10:32,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 0: [2022-11-26 02:10:32,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 0: [2022-11-26 02:10:32,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-26 02:10:32,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 15: [2022-11-26 02:10:32,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 15: [2022-11-26 02:10:32,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-26 02:10:32,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 4: [2022-11-26 02:10:32,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-26 02:10:32,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-26 02:10:32,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 41: [2022-11-26 02:10:32,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 41: [2022-11-26 02:10:32,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-26 02:10:32,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 2: [2022-11-26 02:10:32,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-26 02:10:32,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-26 02:10:32,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 43: [2022-11-26 02:10:32,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-26 02:10:32,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-26 02:10:32,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 14: [2022-11-26 02:10:32,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-26 02:10:32,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-26 02:10:32,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 21: [2022-11-26 02:10:32,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-26 02:10:32,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 21: [2022-11-26 02:10:32,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 17: [2022-11-26 02:10:32,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-26 02:10:32,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-26 02:10:32,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 22: [2022-11-26 02:10:32,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-26 02:10:32,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-26 02:10:32,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 58: [2022-11-26 02:10:32,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-26 02:10:32,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-26 02:10:32,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 3: [2022-11-26 02:10:32,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 24: [2022-11-26 02:10:32,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 55: [2022-11-26 02:10:32,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 24: [2022-11-26 02:10:32,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 55: [2022-11-26 02:10:32,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 24: [2022-11-26 02:10:32,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 32: [2022-11-26 02:10:32,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 55: [2022-11-26 02:10:32,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 3: [2022-11-26 02:10:32,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 32: [2022-11-26 02:10:32,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 45: [2022-11-26 02:10:32,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 39: [2022-11-26 02:10:32,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 32: [2022-11-26 02:10:32,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 45: [2022-11-26 02:10:32,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 39: [2022-11-26 02:10:32,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 3: [2022-11-26 02:10:32,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 45: [2022-11-26 02:10:32,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 39: [2022-11-26 02:10:32,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 34: [2022-11-26 02:10:32,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 34: [2022-11-26 02:10:32,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-26 02:10:32,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 12: [2022-11-26 02:10:32,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-26 02:10:32,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-26 02:10:32,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 50: [2022-11-26 02:10:32,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-26 02:10:32,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-26 02:10:32,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 51: [2022-11-26 02:10:32,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-26 02:10:32,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-26 02:10:32,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 31: [2022-11-26 02:10:32,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:10:32,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-26 02:10:32,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 59: [2022-11-26 02:10:32,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-26 02:10:32,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-26 02:10:32,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 10: [2022-11-26 02:10:32,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-26 02:10:32,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-26 02:10:32,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 53: [2022-11-26 02:10:32,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-26 02:10:32,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-26 02:10:32,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 61: [2022-11-26 02:10:32,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-26 02:10:32,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-26 02:10:32,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 46: [2022-11-26 02:10:32,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:10:32,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-26 02:10:32,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 35: [2022-11-26 02:10:32,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-26 02:10:32,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-26 02:10:32,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 8: [2022-11-26 02:10:32,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-26 02:10:32,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-26 02:10:32,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 23: [2022-11-26 02:10:32,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-26 02:10:32,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-26 02:10:32,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 54: [2022-11-26 02:10:32,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 54: [2022-11-26 02:10:32,366] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-26 02:10:32,366] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 5: [2022-11-26 02:10:32,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-26 02:10:32,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 5: [2022-11-26 02:10:32,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 57: [2022-11-26 02:10:32,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 40: [2022-11-26 02:10:32,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-26 02:10:32,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-26 02:10:32,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 57: [2022-11-26 02:10:32,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 57: [2022-11-26 02:10:32,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 48: [2022-11-26 02:10:32,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-26 02:10:32,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-26 02:10:32,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 49: [2022-11-26 02:10:32,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-26 02:10:32,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-26 02:10:32,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 60: [2022-11-26 02:10:32,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-26 02:10:32,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-26 02:10:32,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 16: [2022-11-26 02:10:32,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-26 02:10:32,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-26 02:10:32,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 7: [2022-11-26 02:10:32,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 7: [2022-11-26 02:10:32,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-26 02:10:32,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 38: [2022-11-26 02:10:32,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-26 02:10:32,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-26 02:10:32,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 44: [2022-11-26 02:10:32,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-26 02:10:32,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-26 02:10:32,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 27: [2022-11-26 02:10:32,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 36: [2022-11-26 02:10:32,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-26 02:10:32,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 27: [2022-11-26 02:10:32,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 36: [2022-11-26 02:10:32,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 27: [2022-11-26 02:10:32,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 56: [2022-11-26 02:10:32,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-26 02:10:32,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-26 02:10:32,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 33: [2022-11-26 02:10:32,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 33: [2022-11-26 02:10:32,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-26 02:10:32,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 42: [2022-11-26 02:10:32,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 42: [2022-11-26 02:10:32,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 42: [2022-11-26 02:10:32,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 19: [2022-11-26 02:10:32,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:10:32,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-26 02:10:32,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 9: [2022-11-26 02:10:32,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 9: [2022-11-26 02:10:32,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-26 02:10:32,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 30: [2022-11-26 02:10:32,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 30: [2022-11-26 02:10:32,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 6: [2022-11-26 02:10:32,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 30: [2022-11-26 02:10:32,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 6: [2022-11-26 02:10:32,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-26 02:10:32,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 50: [2022-11-26 02:10:32,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 50: [2022-11-26 02:10:32,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-26 02:10:32,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 28: [2022-11-26 02:10:32,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-26 02:10:32,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-26 02:10:32,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 62: [2022-11-26 02:10:32,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-26 02:10:32,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-26 02:10:32,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 63: [2022-11-26 02:10:32,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 63: [2022-11-26 02:10:32,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-26 02:10:32,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 13: [2022-11-26 02:10:32,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 13: [2022-11-26 02:10:32,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 41: [2022-11-26 02:10:32,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 13: [2022-11-26 02:10:32,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 41: [2022-11-26 02:10:32,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-26 02:10:32,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 1: [2022-11-26 02:10:32,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-26 02:10:32,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-26 02:10:32,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 29: [2022-11-26 02:10:32,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-26 02:10:32,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-26 02:10:32,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 2: [2022-11-26 02:10:32,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-26 02:10:32,390] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-26 02:10:32,390] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 15: [2022-11-26 02:10:32,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-26 02:10:32,390] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-26 02:10:32,390] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 18: [2022-11-26 02:10:32,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-26 02:10:32,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-26 02:10:32,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 52: [2022-11-26 02:10:32,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-26 02:10:32,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 26: [2022-11-26 02:10:32,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 52: [2022-11-26 02:10:32,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 26: [2022-11-26 02:10:32,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-26 02:10:32,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 0: [2022-11-26 02:10:32,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 4: [2022-11-26 02:10:32,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 37: [2022-11-26 02:10:32,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 0: [2022-11-26 02:10:32,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 14: [2022-11-26 02:10:32,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 0: [2022-11-26 02:10:32,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 37: [2022-11-26 02:10:32,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 14: [2022-11-26 02:10:32,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 37: [2022-11-26 02:10:32,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 14: [2022-11-26 02:10:32,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 43: [2022-11-26 02:10:32,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 43: [2022-11-26 02:10:32,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 43: [2022-11-26 02:10:32,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 10: [2022-11-26 02:10:32,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 10: [2022-11-26 02:10:32,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 10: [2022-11-26 02:10:32,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 22: [2022-11-26 02:10:32,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 22: [2022-11-26 02:10:32,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 32: [2022-11-26 02:10:32,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 22: [2022-11-26 02:10:32,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 32: [2022-11-26 02:10:32,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-26 02:10:32,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 4: [2022-11-26 02:10:32,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-26 02:10:32,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 3: [2022-11-26 02:10:32,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-26 02:10:32,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 3: [2022-11-26 02:10:32,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 23: [2022-11-26 02:10:32,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-26 02:10:32,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 23: [2022-11-26 02:10:32,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 59: [2022-11-26 02:10:32,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 59: [2022-11-26 02:10:32,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-26 02:10:32,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 39: [2022-11-26 02:10:32,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 39: [2022-11-26 02:10:32,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-26 02:10:32,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 47: [2022-11-26 02:10:32,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-26 02:10:32,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-26 02:10:32,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 31: [2022-11-26 02:10:32,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:10:32,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 45: [2022-11-26 02:10:32,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 31: [2022-11-26 02:10:32,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 45: [2022-11-26 02:10:32,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-26 02:10:32,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 20: [2022-11-26 02:10:32,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-26 02:10:32,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-26 02:10:32,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 21: [2022-11-26 02:10:32,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-26 02:10:32,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-26 02:10:32,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 17: [2022-11-26 02:10:32,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-26 02:10:32,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-26 02:10:32,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 8: [2022-11-26 02:10:32,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 8: [2022-11-26 02:10:32,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-26 02:10:32,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 5: [2022-11-26 02:10:32,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 5: [2022-11-26 02:10:32,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-26 02:10:32,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 53: [2022-11-26 02:10:32,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-26 02:10:32,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-26 02:10:32,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 61: [2022-11-26 02:10:32,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-26 02:10:32,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-26 02:10:32,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 51: [2022-11-26 02:10:32,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 25: [2022-11-26 02:10:32,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 51: [2022-11-26 02:10:32,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-26 02:10:32,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 25: [2022-11-26 02:10:32,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-26 02:10:32,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 34: [2022-11-26 02:10:32,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-26 02:10:32,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-26 02:10:32,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 58: [2022-11-26 02:10:32,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 58: [2022-11-26 02:10:32,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-26 02:10:32,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 40: [2022-11-26 02:10:32,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-26 02:10:32,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-26 02:10:32,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 24: [2022-11-26 02:10:32,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-26 02:10:32,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-26 02:10:32,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 35: [2022-11-26 02:10:32,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-26 02:10:32,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-26 02:10:32,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 11: [2022-11-26 02:10:32,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-26 02:10:32,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 11: [2022-11-26 02:10:32,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 60: [2022-11-26 02:10:32,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-26 02:10:32,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-26 02:10:32,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 57: [2022-11-26 02:10:32,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 57: [2022-11-26 02:10:32,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-26 02:10:32,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 54: [2022-11-26 02:10:32,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 54: [2022-11-26 02:10:32,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-26 02:10:32,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 16: [2022-11-26 02:10:32,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-26 02:10:32,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 16: [2022-11-26 02:10:32,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 46: [2022-11-26 02:10:32,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:10:32,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-26 02:10:32,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 55: [2022-11-26 02:10:32,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-26 02:10:32,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-26 02:10:32,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 48: [2022-11-26 02:10:32,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-26 02:10:32,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-26 02:10:32,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 56: [2022-11-26 02:10:32,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-26 02:10:32,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 56: [2022-11-26 02:10:32,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 36: [2022-11-26 02:10:32,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-26 02:10:32,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-26 02:10:32,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 63: [2022-11-26 02:10:32,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-26 02:10:32,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-26 02:10:32,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 38: [2022-11-26 02:10:32,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 42: [2022-11-26 02:10:32,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 42: [2022-11-26 02:10:32,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-26 02:10:32,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 38: [2022-11-26 02:10:32,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-26 02:10:32,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 49: [2022-11-26 02:10:32,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-26 02:10:32,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-26 02:10:32,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 7: [2022-11-26 02:10:32,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-26 02:10:32,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 7: [2022-11-26 02:10:32,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 6: [2022-11-26 02:10:32,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 6: [2022-11-26 02:10:32,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-26 02:10:32,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 62: [2022-11-26 02:10:32,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-26 02:10:32,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-26 02:10:32,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 37: [2022-11-26 02:10:32,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-26 02:10:32,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-26 02:10:32,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 27: [2022-11-26 02:10:32,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:10:32,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 27: [2022-11-26 02:10:32,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 19: [2022-11-26 02:10:32,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 27: [2022-11-26 02:10:32,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 19: [2022-11-26 02:10:32,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 30: [2022-11-26 02:10:32,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-26 02:10:32,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-26 02:10:32,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 47: [2022-11-26 02:10:32,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-26 02:10:32,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-26 02:10:32,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 25: [2022-11-26 02:10:32,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 25: [2022-11-26 02:10:32,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 25: [2022-11-26 02:10:32,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 33: [2022-11-26 02:10:32,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-26 02:10:32,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-26 02:10:32,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 28: [2022-11-26 02:10:32,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-26 02:10:32,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-26 02:10:32,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 52: [2022-11-26 02:10:32,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-26 02:10:32,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-26 02:10:32,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 50: [2022-11-26 02:10:32,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-26 02:10:32,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-26 02:10:32,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 18: [2022-11-26 02:10:32,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-26 02:10:32,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-26 02:10:32,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 11: [2022-11-26 02:10:32,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 11: [2022-11-26 02:10:32,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-26 02:10:32,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 41: [2022-11-26 02:10:32,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-26 02:10:32,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-26 02:10:32,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 29: [2022-11-26 02:10:32,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-26 02:10:32,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-26 02:10:32,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 44: [2022-11-26 02:10:32,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-26 02:10:32,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-26 02:10:32,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 1: [2022-11-26 02:10:32,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 0: [2022-11-26 02:10:32,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 1: [2022-11-26 02:10:32,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 0: [2022-11-26 02:10:32,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 1: [2022-11-26 02:10:32,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 0: [2022-11-26 02:10:32,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 20: [2022-11-26 02:10:32,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-26 02:10:32,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-26 02:10:32,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 9: [2022-11-26 02:10:32,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-26 02:10:32,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 13: [2022-11-26 02:10:32,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-26 02:10:32,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 9: [2022-11-26 02:10:32,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 13: [2022-11-26 02:10:32,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 2: [2022-11-26 02:10:32,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-26 02:10:32,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-26 02:10:32,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 26: [2022-11-26 02:10:32,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-26 02:10:32,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-26 02:10:32,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 14: [2022-11-26 02:10:32,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-26 02:10:32,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-26 02:10:32,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 43: [2022-11-26 02:10:32,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-26 02:10:32,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 43: [2022-11-26 02:10:32,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 15: [2022-11-26 02:10:32,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-26 02:10:32,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-26 02:10:32,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 4: [2022-11-26 02:10:32,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 4: [2022-11-26 02:10:32,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-26 02:10:32,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 22: [2022-11-26 02:10:32,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-26 02:10:32,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-26 02:10:32,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 10: [2022-11-26 02:10:32,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-26 02:10:32,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-26 02:10:32,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 59: [2022-11-26 02:10:32,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-26 02:10:32,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-26 02:10:32,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 3: [2022-11-26 02:10:32,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-26 02:10:32,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 24: [2022-11-26 02:10:32,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 3: [2022-11-26 02:10:32,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 24: [2022-11-26 02:10:32,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-26 02:10:32,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 21: [2022-11-26 02:10:32,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 21: [2022-11-26 02:10:32,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-26 02:10:32,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 12: [2022-11-26 02:10:32,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-26 02:10:32,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-26 02:10:32,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 17: [2022-11-26 02:10:32,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 39: [2022-11-26 02:10:32,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 17: [2022-11-26 02:10:32,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 39: [2022-11-26 02:10:32,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 17: [2022-11-26 02:10:32,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 39: [2022-11-26 02:10:32,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 58: [2022-11-26 02:10:32,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-26 02:10:32,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-26 02:10:32,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 53: [2022-11-26 02:10:32,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-26 02:10:32,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-26 02:10:32,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 32: [2022-11-26 02:10:32,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-26 02:10:32,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-26 02:10:32,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 51: [2022-11-26 02:10:32,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-26 02:10:32,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 31: [2022-11-26 02:10:32,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 51: [2022-11-26 02:10:32,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 31: [2022-11-26 02:10:32,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-26 02:10:32,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 23: [2022-11-26 02:10:32,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-26 02:10:32,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-26 02:10:32,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 5: [2022-11-26 02:10:32,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-26 02:10:32,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-26 02:10:32,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 8: [2022-11-26 02:10:32,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-26 02:10:32,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-26 02:10:32,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 46: [2022-11-26 02:10:32,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 45: [2022-11-26 02:10:32,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:10:32,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 45: [2022-11-26 02:10:32,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 46: [2022-11-26 02:10:32,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 45: [2022-11-26 02:10:32,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 49: [2022-11-26 02:10:32,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-26 02:10:32,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-26 02:10:32,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 60: [2022-11-26 02:10:32,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-26 02:10:32,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 35: [2022-11-26 02:10:32,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 60: [2022-11-26 02:10:32,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 35: [2022-11-26 02:10:32,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 61: [2022-11-26 02:10:32,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 35: [2022-11-26 02:10:32,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 61: [2022-11-26 02:10:32,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-26 02:10:32,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 16: [2022-11-26 02:10:32,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-26 02:10:32,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-26 02:10:32,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 12: [2022-11-26 02:10:32,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-26 02:10:32,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-26 02:10:32,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 40: [2022-11-26 02:10:32,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-26 02:10:32,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-26 02:10:32,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 7: [2022-11-26 02:10:32,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-26 02:10:32,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-26 02:10:32,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 57: [2022-11-26 02:10:32,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 57: [2022-11-26 02:10:32,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 57: [2022-11-26 02:10:32,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 54: [2022-11-26 02:10:32,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 54: [2022-11-26 02:10:32,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-26 02:10:32,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 55: [2022-11-26 02:10:32,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-26 02:10:32,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-26 02:10:32,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 36: [2022-11-26 02:10:32,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 36: [2022-11-26 02:10:32,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-26 02:10:32,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 34: [2022-11-26 02:10:32,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 34: [2022-11-26 02:10:32,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-26 02:10:32,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 48: [2022-11-26 02:10:32,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-26 02:10:32,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-26 02:10:32,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 27: [2022-11-26 02:10:32,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 27: [2022-11-26 02:10:32,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-26 02:10:32,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 37: [2022-11-26 02:10:32,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 63: [2022-11-26 02:10:32,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 37: [2022-11-26 02:10:32,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 63: [2022-11-26 02:10:32,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 37: [2022-11-26 02:10:32,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 63: [2022-11-26 02:10:32,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 56: [2022-11-26 02:10:32,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-26 02:10:32,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-26 02:10:32,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 42: [2022-11-26 02:10:32,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 42: [2022-11-26 02:10:32,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-26 02:10:32,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 33: [2022-11-26 02:10:32,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-26 02:10:32,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-26 02:10:32,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 38: [2022-11-26 02:10:32,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-26 02:10:32,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-26 02:10:32,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 30: [2022-11-26 02:10:32,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-26 02:10:32,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-26 02:10:32,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 44: [2022-11-26 02:10:32,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-26 02:10:32,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-26 02:10:32,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 6: [2022-11-26 02:10:32,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-26 02:10:32,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-26 02:10:32,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 62: [2022-11-26 02:10:32,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-26 02:10:32,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 62: [2022-11-26 02:10:32,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 19: [2022-11-26 02:10:32,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:10:32,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-26 02:10:32,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 28: [2022-11-26 02:10:32,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-26 02:10:32,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-26 02:10:32,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 52: [2022-11-26 02:10:32,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-26 02:10:32,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-26 02:10:32,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 47: [2022-11-26 02:10:32,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 47: [2022-11-26 02:10:32,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 47: [2022-11-26 02:10:32,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 25: [2022-11-26 02:10:32,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-26 02:10:32,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-26 02:10:32,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 9: [2022-11-26 02:10:32,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 9: [2022-11-26 02:10:32,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-26 02:10:32,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 0: [2022-11-26 02:10:32,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-26 02:10:32,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 0: [2022-11-26 02:10:32,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 53: [2022-11-26 02:10:32,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-26 02:10:32,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-26 02:10:32,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 13: [2022-11-26 02:10:32,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-26 02:10:32,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-26 02:10:32,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 29: [2022-11-26 02:10:32,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-26 02:10:32,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-26 02:10:32,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 11: [2022-11-26 02:10:32,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 12: [2022-11-26 02:10:32,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 11: [2022-11-26 02:10:32,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-26 02:10:32,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 12: [2022-11-26 02:10:32,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-26 02:10:32,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 26: [2022-11-26 02:10:32,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-26 02:10:32,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-26 02:10:32,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 50: [2022-11-26 02:10:32,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-26 02:10:32,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 43: [2022-11-26 02:10:32,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 50: [2022-11-26 02:10:32,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 43: [2022-11-26 02:10:32,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-26 02:10:32,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 10: [2022-11-26 02:10:32,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-26 02:10:32,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 61: [2022-11-26 02:10:32,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 10: [2022-11-26 02:10:32,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 61: [2022-11-26 02:10:32,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-26 02:10:32,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 3: [2022-11-26 02:10:32,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-26 02:10:32,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 2: [2022-11-26 02:10:32,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-26 02:10:32,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 3: [2022-11-26 02:10:32,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 2: [2022-11-26 02:10:32,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 40: [2022-11-26 02:10:32,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-26 02:10:32,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-26 02:10:32,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 16: [2022-11-26 02:10:32,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-26 02:10:32,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-26 02:10:32,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 32: [2022-11-26 02:10:32,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 17: [2022-11-26 02:10:32,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 32: [2022-11-26 02:10:32,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 21: [2022-11-26 02:10:32,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 32: [2022-11-26 02:10:32,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 17: [2022-11-26 02:10:32,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 21: [2022-11-26 02:10:32,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 60: [2022-11-26 02:10:32,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 17: [2022-11-26 02:10:32,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 21: [2022-11-26 02:10:32,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 60: [2022-11-26 02:10:32,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 35: [2022-11-26 02:10:32,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 60: [2022-11-26 02:10:32,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 50: [2022-11-26 02:10:32,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 35: [2022-11-26 02:10:32,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-26 02:10:32,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 50: [2022-11-26 02:10:32,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-26 02:10:32,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 14: [2022-11-26 02:10:32,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 24: [2022-11-26 02:10:32,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 14: [2022-11-26 02:10:32,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-26 02:10:32,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 1: [2022-11-26 02:10:32,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 24: [2022-11-26 02:10:32,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-26 02:10:32,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 1: [2022-11-26 02:10:32,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-26 02:10:32,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 1: [2022-11-26 02:10:32,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 1: [2022-11-26 02:10:32,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-26 02:10:32,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 18: [2022-11-26 02:10:32,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-26 02:10:32,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-26 02:10:32,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 51: [2022-11-26 02:10:32,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 36: [2022-11-26 02:10:32,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 51: [2022-11-26 02:10:32,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-26 02:10:32,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 36: [2022-11-26 02:10:32,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-26 02:10:32,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 39: [2022-11-26 02:10:32,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-26 02:10:32,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-26 02:10:32,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 31: [2022-11-26 02:10:32,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 20: [2022-11-26 02:10:32,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:10:32,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 20: [2022-11-26 02:10:32,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 57: [2022-11-26 02:10:32,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 20: [2022-11-26 02:10:32,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 31: [2022-11-26 02:10:32,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 28: [2022-11-26 02:10:32,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 57: [2022-11-26 02:10:32,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-26 02:10:32,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 28: [2022-11-26 02:10:32,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-26 02:10:32,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 8: [2022-11-26 02:10:32,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-26 02:10:32,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-26 02:10:32,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 11: [2022-11-26 02:10:32,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 7: [2022-11-26 02:10:32,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 41: [2022-11-26 02:10:32,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 34: [2022-11-26 02:10:32,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 11: [2022-11-26 02:10:32,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 7: [2022-11-26 02:10:32,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 41: [2022-11-26 02:10:32,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 7: [2022-11-26 02:10:32,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 34: [2022-11-26 02:10:32,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 42: [2022-11-26 02:10:32,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 41: [2022-11-26 02:10:32,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 11: [2022-11-26 02:10:32,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 34: [2022-11-26 02:10:32,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 45: [2022-11-26 02:10:32,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 15: [2022-11-26 02:10:32,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 42: [2022-11-26 02:10:32,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 15: [2022-11-26 02:10:32,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 42: [2022-11-26 02:10:32,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 45: [2022-11-26 02:10:32,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 15: [2022-11-26 02:10:32,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 45: [2022-11-26 02:10:32,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 15: [2022-11-26 02:10:32,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-26 02:10:32,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-26 02:10:32,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 4: [2022-11-26 02:10:32,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 4: [2022-11-26 02:10:32,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 62: [2022-11-26 02:10:32,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 4: [2022-11-26 02:10:32,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 62: [2022-11-26 02:10:32,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 23: [2022-11-26 02:10:32,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 5: [2022-11-26 02:10:32,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 62: [2022-11-26 02:10:32,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 5: [2022-11-26 02:10:32,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 23: [2022-11-26 02:10:32,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 5: [2022-11-26 02:10:32,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 58: [2022-11-26 02:10:32,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 23: [2022-11-26 02:10:32,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 58: [2022-11-26 02:10:32,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 49: [2022-11-26 02:10:32,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 58: [2022-11-26 02:10:32,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 49: [2022-11-26 02:10:32,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-26 02:10:32,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 33: [2022-11-26 02:10:32,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-26 02:10:32,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-26 02:10:32,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 44: [2022-11-26 02:10:32,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-26 02:10:32,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-26 02:10:32,513] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 56: [2022-11-26 02:10:32,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 54: [2022-11-26 02:10:32,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 56: [2022-11-26 02:10:32,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-26 02:10:32,514] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 55: [2022-11-26 02:10:32,514] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-26 02:10:32,514] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-26 02:10:32,514] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 63: [2022-11-26 02:10:32,514] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-26 02:10:32,514] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-26 02:10:32,514] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 18: [2022-11-26 02:10:32,514] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-26 02:10:32,514] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-26 02:10:32,514] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 54: [2022-11-26 02:10:32,514] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-26 02:10:32,514] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 22: [2022-11-26 02:10:32,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-26 02:10:32,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-26 02:10:32,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 20: [2022-11-26 02:10:32,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 20: [2022-11-26 02:10:32,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-26 02:10:32,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 41: [2022-11-26 02:10:32,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-26 02:10:32,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 29: [2022-11-26 02:10:32,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 41: [2022-11-26 02:10:32,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 29: [2022-11-26 02:10:32,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-26 02:10:32,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 38: [2022-11-26 02:10:32,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 38: [2022-11-26 02:10:32,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-26 02:10:32,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 59: [2022-11-26 02:10:32,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 22: [2022-11-26 02:10:32,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-26 02:10:32,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 59: [2022-11-26 02:10:32,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-26 02:10:32,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 22: [2022-11-26 02:10:32,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 4: [2022-11-26 02:10:32,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 4: [2022-11-26 02:10:32,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-26 02:10:32,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 25: [2022-11-26 02:10:32,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 25: [2022-11-26 02:10:32,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-26 02:10:32,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 48: [2022-11-26 02:10:32,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-26 02:10:32,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-26 02:10:32,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 48: [2022-11-26 02:10:32,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-26 02:10:32,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-26 02:10:32,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 46: [2022-11-26 02:10:32,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:10:32,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-26 02:10:32,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 46: [2022-11-26 02:10:32,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:10:32,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step13000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-26 02:10:32,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! 0: successfully saved checkpoint at iteration 13000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5136.15 63: iteration 13010/ 24424 | consumed samples: 6661120 | consumed tokens: 13641973760 | elapsed time per iteration (s): 2.82 | learning rate: 1.021E-04 | global batch size: 512 | lm loss: 2.078427E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.822 | TFLOPs: 18.72 | 63: iteration 13020/ 24424 | consumed samples: 6666240 | consumed tokens: 13652459520 | elapsed time per iteration (s): 2.23 | learning rate: 1.020E-04 | global batch size: 512 | lm loss: 2.090454E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.037 | TFLOPs: 23.68 | 63: iteration 13030/ 24424 | consumed samples: 6671360 | consumed tokens: 13662945280 | elapsed time per iteration (s): 2.26 | learning rate: 1.019E-04 | global batch size: 512 | lm loss: 2.097919E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.668 | TFLOPs: 23.33 | 63: iteration 13040/ 24424 | consumed samples: 6676480 | consumed tokens: 13673431040 | elapsed time per iteration (s): 2.28 | learning rate: 1.018E-04 | global batch size: 512 | lm loss: 2.080682E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.363 | TFLOPs: 23.10 | 63: iteration 13050/ 24424 | consumed samples: 6681600 | consumed tokens: 13683916800 | elapsed time per iteration (s): 2.24 | learning rate: 1.016E-04 | global batch size: 512 | lm loss: 2.076693E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.069 | TFLOPs: 23.58 | 63: iteration 13060/ 24424 | consumed samples: 6686720 | consumed tokens: 13694402560 | elapsed time per iteration (s): 2.23 | learning rate: 1.015E-04 | global batch size: 512 | lm loss: 2.083106E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.865 | TFLOPs: 23.66 | 63: iteration 13070/ 24424 | consumed samples: 6691840 | consumed tokens: 13704888320 | elapsed time per iteration (s): 2.23 | learning rate: 1.014E-04 | global batch size: 512 | lm loss: 2.075080E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.500 | TFLOPs: 23.63 | 63: iteration 13080/ 24424 | consumed samples: 6696960 | consumed tokens: 13715374080 | elapsed time per iteration (s): 2.23 | learning rate: 1.013E-04 | global batch size: 512 | lm loss: 2.057116E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.082 | TFLOPs: 23.69 | 63: iteration 13090/ 24424 | consumed samples: 6702080 | consumed tokens: 13725859840 | elapsed time per iteration (s): 2.69 | learning rate: 1.012E-04 | global batch size: 512 | lm loss: 2.076114E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.266 | TFLOPs: 19.59 | 63: iteration 13100/ 24424 | consumed samples: 6707200 | consumed tokens: 13736345600 | elapsed time per iteration (s): 2.31 | learning rate: 1.011E-04 | global batch size: 512 | lm loss: 2.083126E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.097 | TFLOPs: 22.86 | 63: iteration 13110/ 24424 | consumed samples: 6712320 | consumed tokens: 13746831360 | elapsed time per iteration (s): 2.23 | learning rate: 1.009E-04 | global batch size: 512 | lm loss: 2.095682E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.458 | TFLOPs: 23.62 | 63: iteration 13120/ 24424 | consumed samples: 6717440 | consumed tokens: 13757317120 | elapsed time per iteration (s): 2.27 | learning rate: 1.008E-04 | global batch size: 512 | lm loss: 2.067592E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.674 | TFLOPs: 23.23 | 63: iteration 13130/ 24424 | consumed samples: 6722560 | consumed tokens: 13767802880 | elapsed time per iteration (s): 2.25 | learning rate: 1.007E-04 | global batch size: 512 | lm loss: 2.068004E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.002 | TFLOPs: 23.47 | 63: iteration 13140/ 24424 | consumed samples: 6727680 | consumed tokens: 13778288640 | elapsed time per iteration (s): 2.23 | learning rate: 1.006E-04 | global batch size: 512 | lm loss: 2.075913E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.761 | TFLOPs: 23.65 | 63: iteration 13150/ 24424 | consumed samples: 6732800 | consumed tokens: 13788774400 | elapsed time per iteration (s): 2.24 | learning rate: 1.005E-04 | global batch size: 512 | lm loss: 2.089996E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.568 | TFLOPs: 23.53 | 63: iteration 13160/ 24424 | consumed samples: 6737920 | consumed tokens: 13799260160 | elapsed time per iteration (s): 2.27 | learning rate: 1.004E-04 | global batch size: 512 | lm loss: 2.088675E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.786 | TFLOPs: 23.24 | 63: iteration 13170/ 24424 | consumed samples: 6743040 | consumed tokens: 13809745920 | elapsed time per iteration (s): 2.25 | learning rate: 1.002E-04 | global batch size: 512 | lm loss: 2.104798E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.676 | TFLOPs: 23.44 | 63: iteration 13180/ 24424 | consumed samples: 6748160 | consumed tokens: 13820231680 | elapsed time per iteration (s): 2.25 | learning rate: 1.001E-04 | global batch size: 512 | lm loss: 2.063709E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.243 | TFLOPs: 23.39 | 63: iteration 13190/ 24424 | consumed samples: 6753280 | consumed tokens: 13830717440 | elapsed time per iteration (s): 2.24 | learning rate: 1.000E-04 | global batch size: 512 | lm loss: 2.091560E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.256 | TFLOPs: 23.50 | 63: iteration 13200/ 24424 | consumed samples: 6758400 | consumed tokens: 13841203200 | elapsed time per iteration (s): 2.26 | learning rate: 9.990E-05 | global batch size: 512 | lm loss: 2.086686E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.600 | TFLOPs: 23.33 | 63: iteration 13210/ 24424 | consumed samples: 6763520 | consumed tokens: 13851688960 | elapsed time per iteration (s): 2.23 | learning rate: 9.978E-05 | global batch size: 512 | lm loss: 2.085510E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.592 | TFLOPs: 23.64 | 63: iteration 13220/ 24424 | consumed samples: 6768640 | consumed tokens: 13862174720 | elapsed time per iteration (s): 2.28 | learning rate: 9.967E-05 | global batch size: 512 | lm loss: 2.083835E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.463 | TFLOPs: 23.11 | 63: iteration 13230/ 24424 | consumed samples: 6773760 | consumed tokens: 13872660480 | elapsed time per iteration (s): 2.27 | learning rate: 9.955E-05 | global batch size: 512 | lm loss: 2.075706E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.102 | TFLOPs: 23.17 | 63: iteration 13240/ 24424 | consumed samples: 6778880 | consumed tokens: 13883146240 | elapsed time per iteration (s): 2.23 | learning rate: 9.944E-05 | global batch size: 512 | lm loss: 2.066454E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.131 | TFLOPs: 23.59 | 63: iteration 13250/ 24424 | consumed samples: 6784000 | consumed tokens: 13893632000 | elapsed time per iteration (s): 2.35 | learning rate: 9.932E-05 | global batch size: 512 | lm loss: 2.072151E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 218.270 | TFLOPs: 22.47 | 63: iteration 13260/ 24424 | consumed samples: 6789120 | consumed tokens: 13904117760 | elapsed time per iteration (s): 2.27 | learning rate: 9.920E-05 | global batch size: 512 | lm loss: 2.084419E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.511 | TFLOPs: 23.22 | 63: iteration 13270/ 24424 | consumed samples: 6794240 | consumed tokens: 13914603520 | elapsed time per iteration (s): 4.02 | learning rate: 9.909E-05 | global batch size: 512 | lm loss: 2.073524E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 127.412 | TFLOPs: 13.12 | 63: iteration 13280/ 24424 | consumed samples: 6799360 | consumed tokens: 13925089280 | elapsed time per iteration (s): 2.24 | learning rate: 9.897E-05 | global batch size: 512 | lm loss: 2.085114E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.150 | TFLOPs: 23.49 | 63: iteration 13290/ 24424 | consumed samples: 6804480 | consumed tokens: 13935575040 | elapsed time per iteration (s): 2.26 | learning rate: 9.886E-05 | global batch size: 512 | lm loss: 2.064567E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.642 | TFLOPs: 23.33 | 63: iteration 13300/ 24424 | consumed samples: 6809600 | consumed tokens: 13946060800 | elapsed time per iteration (s): 2.25 | learning rate: 9.874E-05 | global batch size: 512 | lm loss: 2.068367E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.668 | TFLOPs: 23.44 | 63: iteration 13310/ 24424 | consumed samples: 6814720 | consumed tokens: 13956546560 | elapsed time per iteration (s): 2.23 | learning rate: 9.862E-05 | global batch size: 512 | lm loss: 2.076694E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.237 | TFLOPs: 23.60 | 63: iteration 13320/ 24424 | consumed samples: 6819840 | consumed tokens: 13967032320 | elapsed time per iteration (s): 2.25 | learning rate: 9.851E-05 | global batch size: 512 | lm loss: 2.081413E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.894 | TFLOPs: 23.46 | 63: iteration 13330/ 24424 | consumed samples: 6824960 | consumed tokens: 13977518080 | elapsed time per iteration (s): 2.25 | learning rate: 9.839E-05 | global batch size: 512 | lm loss: 2.060724E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.026 | TFLOPs: 23.47 | 63: iteration 13340/ 24424 | consumed samples: 6830080 | consumed tokens: 13988003840 | elapsed time per iteration (s): 2.28 | learning rate: 9.828E-05 | global batch size: 512 | lm loss: 2.099382E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.219 | TFLOPs: 23.08 | 63: iteration 13350/ 24424 | consumed samples: 6835200 | consumed tokens: 13998489600 | elapsed time per iteration (s): 2.27 | learning rate: 9.816E-05 | global batch size: 512 | lm loss: 2.070191E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.744 | TFLOPs: 23.24 | 63: iteration 13360/ 24424 | consumed samples: 6840320 | consumed tokens: 14008975360 | elapsed time per iteration (s): 2.25 | learning rate: 9.804E-05 | global batch size: 512 | lm loss: 2.072421E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.006 | TFLOPs: 23.47 | 63: iteration 13370/ 24424 | consumed samples: 6845440 | consumed tokens: 14019461120 | elapsed time per iteration (s): 2.23 | learning rate: 9.793E-05 | global batch size: 512 | lm loss: 2.069452E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.258 | TFLOPs: 23.60 | 63: iteration 13380/ 24424 | consumed samples: 6850560 | consumed tokens: 14029946880 | elapsed time per iteration (s): 2.25 | learning rate: 9.781E-05 | global batch size: 512 | lm loss: 2.077020E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.604 | TFLOPs: 23.43 | 63: iteration 13390/ 24424 | consumed samples: 6855680 | consumed tokens: 14040432640 | elapsed time per iteration (s): 2.26 | learning rate: 9.770E-05 | global batch size: 512 | lm loss: 2.089705E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.005 | TFLOPs: 23.37 | 63: iteration 13400/ 24424 | consumed samples: 6860800 | consumed tokens: 14050918400 | elapsed time per iteration (s): 2.62 | learning rate: 9.758E-05 | global batch size: 512 | lm loss: 2.065884E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 195.503 | TFLOPs: 20.13 | 63: iteration 13410/ 24424 | consumed samples: 6865920 | consumed tokens: 14061404160 | elapsed time per iteration (s): 2.23 | learning rate: 9.747E-05 | global batch size: 512 | lm loss: 2.060689E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.695 | TFLOPs: 23.65 | 63: iteration 13420/ 24424 | consumed samples: 6871040 | consumed tokens: 14071889920 | elapsed time per iteration (s): 2.26 | learning rate: 9.735E-05 | global batch size: 512 | lm loss: 2.052662E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.018 | TFLOPs: 23.37 | 63: iteration 13430/ 24424 | consumed samples: 6876160 | consumed tokens: 14082375680 | elapsed time per iteration (s): 2.23 | learning rate: 9.723E-05 | global batch size: 512 | lm loss: 2.072800E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.911 | TFLOPs: 23.67 | 63: iteration 13440/ 24424 | consumed samples: 6881280 | consumed tokens: 14092861440 | elapsed time per iteration (s): 2.29 | learning rate: 9.712E-05 | global batch size: 512 | lm loss: 2.071949E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.793 | TFLOPs: 23.04 | 63: iteration 13450/ 24424 | consumed samples: 6886400 | consumed tokens: 14103347200 | elapsed time per iteration (s): 2.23 | learning rate: 9.700E-05 | global batch size: 512 | lm loss: 2.075070E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.121 | TFLOPs: 23.59 | 63: iteration 13460/ 24424 | consumed samples: 6891520 | consumed tokens: 14113832960 | elapsed time per iteration (s): 2.28 | learning rate: 9.689E-05 | global batch size: 512 | lm loss: 2.065921E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.946 | TFLOPs: 23.16 | 63: iteration 13470/ 24424 | consumed samples: 6896640 | consumed tokens: 14124318720 | elapsed time per iteration (s): 2.26 | learning rate: 9.677E-05 | global batch size: 512 | lm loss: 2.071084E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.838 | TFLOPs: 23.35 | 63: iteration 13480/ 24424 | consumed samples: 6901760 | consumed tokens: 14134804480 | elapsed time per iteration (s): 2.28 | learning rate: 9.666E-05 | global batch size: 512 | lm loss: 2.069763E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.709 | TFLOPs: 23.13 | 63: iteration 13490/ 24424 | consumed samples: 6906880 | consumed tokens: 14145290240 | elapsed time per iteration (s): 2.27 | learning rate: 9.654E-05 | global batch size: 512 | lm loss: 2.066751E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.590 | TFLOPs: 23.22 | 63: iteration 13500/ 24424 | consumed samples: 6912000 | consumed tokens: 14155776000 | elapsed time per iteration (s): 2.29 | learning rate: 9.642E-05 | global batch size: 512 | lm loss: 2.084609E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.250 | TFLOPs: 22.98 | 63: iteration 13510/ 24424 | consumed samples: 6917120 | consumed tokens: 14166261760 | elapsed time per iteration (s): 2.24 | learning rate: 9.631E-05 | global batch size: 512 | lm loss: 2.064874E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.986 | TFLOPs: 23.57 | 63: iteration 13520/ 24424 | consumed samples: 6922240 | consumed tokens: 14176747520 | elapsed time per iteration (s): 2.31 | learning rate: 9.619E-05 | global batch size: 512 | lm loss: 2.057603E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.982 | TFLOPs: 22.85 | 63: iteration 13530/ 24424 | consumed samples: 6927360 | consumed tokens: 14187233280 | elapsed time per iteration (s): 2.32 | learning rate: 9.608E-05 | global batch size: 512 | lm loss: 2.095793E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.881 | TFLOPs: 22.74 | 63: iteration 13540/ 24424 | consumed samples: 6932480 | consumed tokens: 14197719040 | elapsed time per iteration (s): 2.26 | learning rate: 9.596E-05 | global batch size: 512 | lm loss: 2.066022E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.833 | TFLOPs: 23.35 | 63: iteration 13550/ 24424 | consumed samples: 6937600 | consumed tokens: 14208204800 | elapsed time per iteration (s): 2.27 | learning rate: 9.585E-05 | global batch size: 512 | lm loss: 2.064323E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.688 | TFLOPs: 23.23 | 63: iteration 13560/ 24424 | consumed samples: 6942720 | consumed tokens: 14218690560 | elapsed time per iteration (s): 2.40 | learning rate: 9.573E-05 | global batch size: 512 | lm loss: 2.075310E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 213.450 | TFLOPs: 21.97 | 63: iteration 13570/ 24424 | consumed samples: 6947840 | consumed tokens: 14229176320 | elapsed time per iteration (s): 2.24 | learning rate: 9.562E-05 | global batch size: 512 | lm loss: 2.058847E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.211 | TFLOPs: 23.49 | 63: iteration 13580/ 24424 | consumed samples: 6952960 | consumed tokens: 14239662080 | elapsed time per iteration (s): 4.55 | learning rate: 9.550E-05 | global batch size: 512 | lm loss: 2.082449E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 112.531 | TFLOPs: 11.58 | 63: iteration 13590/ 24424 | consumed samples: 6958080 | consumed tokens: 14250147840 | elapsed time per iteration (s): 2.24 | learning rate: 9.538E-05 | global batch size: 512 | lm loss: 2.067031E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.827 | TFLOPs: 23.56 | 63: iteration 13600/ 24424 | consumed samples: 6963200 | consumed tokens: 14260633600 | elapsed time per iteration (s): 2.28 | learning rate: 9.527E-05 | global batch size: 512 | lm loss: 2.074958E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.409 | TFLOPs: 23.10 | 63: iteration 13610/ 24424 | consumed samples: 6968320 | consumed tokens: 14271119360 | elapsed time per iteration (s): 2.28 | learning rate: 9.515E-05 | global batch size: 512 | lm loss: 2.074551E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.611 | TFLOPs: 23.12 | 63: iteration 13620/ 24424 | consumed samples: 6973440 | consumed tokens: 14281605120 | elapsed time per iteration (s): 2.24 | learning rate: 9.504E-05 | global batch size: 512 | lm loss: 2.062635E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.560 | TFLOPs: 23.53 | 63: iteration 13630/ 24424 | consumed samples: 6978560 | consumed tokens: 14292090880 | elapsed time per iteration (s): 2.42 | learning rate: 9.492E-05 | global batch size: 512 | lm loss: 2.062037E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 211.994 | TFLOPs: 21.82 | 63: iteration 13640/ 24424 | consumed samples: 6983680 | consumed tokens: 14302576640 | elapsed time per iteration (s): 2.28 | learning rate: 9.481E-05 | global batch size: 512 | lm loss: 2.063077E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.594 | TFLOPs: 23.12 | 63: iteration 13650/ 24424 | consumed samples: 6988800 | consumed tokens: 14313062400 | elapsed time per iteration (s): 2.26 | learning rate: 9.469E-05 | global batch size: 512 | lm loss: 2.076294E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.809 | TFLOPs: 23.35 | 63: iteration 13660/ 24424 | consumed samples: 6993920 | consumed tokens: 14323548160 | elapsed time per iteration (s): 2.23 | learning rate: 9.458E-05 | global batch size: 512 | lm loss: 2.078291E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.811 | TFLOPs: 23.66 | 63: iteration 13670/ 24424 | consumed samples: 6999040 | consumed tokens: 14334033920 | elapsed time per iteration (s): 2.27 | learning rate: 9.446E-05 | global batch size: 512 | lm loss: 2.068169E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.704 | TFLOPs: 23.24 | 63: iteration 13680/ 24424 | consumed samples: 7004160 | consumed tokens: 14344519680 | elapsed time per iteration (s): 2.26 | learning rate: 9.435E-05 | global batch size: 512 | lm loss: 2.067167E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.322 | TFLOPs: 23.30 | 63: iteration 13690/ 24424 | consumed samples: 7009280 | consumed tokens: 14355005440 | elapsed time per iteration (s): 2.30 | learning rate: 9.423E-05 | global batch size: 512 | lm loss: 2.079169E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.037 | TFLOPs: 22.96 | 63: iteration 13700/ 24424 | consumed samples: 7014400 | consumed tokens: 14365491200 | elapsed time per iteration (s): 2.25 | learning rate: 9.412E-05 | global batch size: 512 | lm loss: 2.064819E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.489 | TFLOPs: 23.42 | 63: iteration 13710/ 24424 | consumed samples: 7019520 | consumed tokens: 14375976960 | elapsed time per iteration (s): 2.67 | learning rate: 9.400E-05 | global batch size: 512 | lm loss: 2.072345E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 191.947 | TFLOPs: 19.76 | 63: iteration 13720/ 24424 | consumed samples: 7024640 | consumed tokens: 14386462720 | elapsed time per iteration (s): 2.23 | learning rate: 9.389E-05 | global batch size: 512 | lm loss: 2.077640E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.405 | TFLOPs: 23.62 | 63: iteration 13730/ 24424 | consumed samples: 7029760 | consumed tokens: 14396948480 | elapsed time per iteration (s): 2.29 | learning rate: 9.377E-05 | global batch size: 512 | lm loss: 2.077067E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.097 | TFLOPs: 22.97 | 63: iteration 13740/ 24424 | consumed samples: 7034880 | consumed tokens: 14407434240 | elapsed time per iteration (s): 2.25 | learning rate: 9.366E-05 | global batch size: 512 | lm loss: 2.067317E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.830 | TFLOPs: 23.45 | 63: iteration 13750/ 24424 | consumed samples: 7040000 | consumed tokens: 14417920000 | elapsed time per iteration (s): 2.23 | learning rate: 9.354E-05 | global batch size: 512 | lm loss: 2.083092E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.144 | TFLOPs: 23.59 | 63: iteration 13760/ 24424 | consumed samples: 7045120 | consumed tokens: 14428405760 | elapsed time per iteration (s): 2.31 | learning rate: 9.343E-05 | global batch size: 512 | lm loss: 2.072071E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.774 | TFLOPs: 22.83 | 63: iteration 13770/ 24424 | consumed samples: 7050240 | consumed tokens: 14438891520 | elapsed time per iteration (s): 2.30 | learning rate: 9.331E-05 | global batch size: 512 | lm loss: 2.066665E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.358 | TFLOPs: 22.89 | 63: iteration 13780/ 24424 | consumed samples: 7055360 | consumed tokens: 14449377280 | elapsed time per iteration (s): 2.25 | learning rate: 9.320E-05 | global batch size: 512 | lm loss: 2.067574E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.559 | TFLOPs: 23.43 | 63: iteration 13790/ 24424 | consumed samples: 7060480 | consumed tokens: 14459863040 | elapsed time per iteration (s): 2.26 | learning rate: 9.308E-05 | global batch size: 512 | lm loss: 2.063947E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.038 | TFLOPs: 23.37 | 63: iteration 13800/ 24424 | consumed samples: 7065600 | consumed tokens: 14470348800 | elapsed time per iteration (s): 2.32 | learning rate: 9.297E-05 | global batch size: 512 | lm loss: 2.081023E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.106 | TFLOPs: 22.76 | 63: iteration 13810/ 24424 | consumed samples: 7070720 | consumed tokens: 14480834560 | elapsed time per iteration (s): 2.23 | learning rate: 9.285E-05 | global batch size: 512 | lm loss: 2.066579E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.550 | TFLOPs: 23.63 | 63: iteration 13820/ 24424 | consumed samples: 7075840 | consumed tokens: 14491320320 | elapsed time per iteration (s): 2.24 | learning rate: 9.274E-05 | global batch size: 512 | lm loss: 2.078020E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.247 | TFLOPs: 23.50 | 63: iteration 13830/ 24424 | consumed samples: 7080960 | consumed tokens: 14501806080 | elapsed time per iteration (s): 2.23 | learning rate: 9.262E-05 | global batch size: 512 | lm loss: 2.067010E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.744 | TFLOPs: 23.65 | 63: iteration 13840/ 24424 | consumed samples: 7086080 | consumed tokens: 14512291840 | elapsed time per iteration (s): 2.26 | learning rate: 9.251E-05 | global batch size: 512 | lm loss: 2.080283E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.144 | TFLOPs: 23.28 | 63: iteration 13850/ 24424 | consumed samples: 7091200 | consumed tokens: 14522777600 | elapsed time per iteration (s): 2.24 | learning rate: 9.239E-05 | global batch size: 512 | lm loss: 2.072165E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.389 | TFLOPs: 23.51 | 63: iteration 13860/ 24424 | consumed samples: 7096320 | consumed tokens: 14533263360 | elapsed time per iteration (s): 2.61 | learning rate: 9.228E-05 | global batch size: 512 | lm loss: 2.091461E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 196.053 | TFLOPs: 20.18 | 63: iteration 13870/ 24424 | consumed samples: 7101440 | consumed tokens: 14543749120 | elapsed time per iteration (s): 2.26 | learning rate: 9.216E-05 | global batch size: 512 | lm loss: 2.071276E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.087 | TFLOPs: 23.27 | 63: iteration 13880/ 24424 | consumed samples: 7106560 | consumed tokens: 14554234880 | elapsed time per iteration (s): 2.28 | learning rate: 9.205E-05 | global batch size: 512 | lm loss: 2.080951E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.821 | TFLOPs: 23.14 | 63: iteration 13890/ 24424 | consumed samples: 7111680 | consumed tokens: 14564720640 | elapsed time per iteration (s): 2.24 | learning rate: 9.194E-05 | global batch size: 512 | lm loss: 2.074775E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.932 | TFLOPs: 23.57 | 63: iteration 13900/ 24424 | consumed samples: 7116800 | consumed tokens: 14575206400 | elapsed time per iteration (s): 2.25 | learning rate: 9.182E-05 | global batch size: 512 | lm loss: 2.059639E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.299 | TFLOPs: 23.40 | 63: iteration 13910/ 24424 | consumed samples: 7121920 | consumed tokens: 14585692160 | elapsed time per iteration (s): 2.27 | learning rate: 9.171E-05 | global batch size: 512 | lm loss: 2.056560E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.859 | TFLOPs: 23.25 | 63: iteration 13920/ 24424 | consumed samples: 7127040 | consumed tokens: 14596177920 | elapsed time per iteration (s): 2.23 | learning rate: 9.159E-05 | global batch size: 512 | lm loss: 2.080236E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.229 | TFLOPs: 23.60 | 63: iteration 13930/ 24424 | consumed samples: 7132160 | consumed tokens: 14606663680 | elapsed time per iteration (s): 2.23 | learning rate: 9.148E-05 | global batch size: 512 | lm loss: 2.090213E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.112 | TFLOPs: 23.59 | 63: iteration 13940/ 24424 | consumed samples: 7137280 | consumed tokens: 14617149440 | elapsed time per iteration (s): 2.25 | learning rate: 9.136E-05 | global batch size: 512 | lm loss: 2.072930E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.910 | TFLOPs: 23.46 | 63: iteration 13950/ 24424 | consumed samples: 7142400 | consumed tokens: 14627635200 | elapsed time per iteration (s): 2.26 | learning rate: 9.125E-05 | global batch size: 512 | lm loss: 2.036506E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.968 | TFLOPs: 23.37 | 63: iteration 13960/ 24424 | consumed samples: 7147520 | consumed tokens: 14638120960 | elapsed time per iteration (s): 2.27 | learning rate: 9.113E-05 | global batch size: 512 | lm loss: 2.055464E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.807 | TFLOPs: 23.25 | 63: iteration 13970/ 24424 | consumed samples: 7152640 | consumed tokens: 14648606720 | elapsed time per iteration (s): 2.26 | learning rate: 9.102E-05 | global batch size: 512 | lm loss: 2.085586E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.786 | TFLOPs: 23.35 | 63: iteration 13980/ 24424 | consumed samples: 7157760 | consumed tokens: 14659092480 | elapsed time per iteration (s): 2.23 | learning rate: 9.091E-05 | global batch size: 512 | lm loss: 2.066024E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.185 | TFLOPs: 23.59 | 63: iteration 13990/ 24424 | consumed samples: 7162880 | consumed tokens: 14669578240 | elapsed time per iteration (s): 2.23 | learning rate: 9.079E-05 | global batch size: 512 | lm loss: 2.069720E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.427 | TFLOPs: 23.62 | 0: [2022-11-26 02:49:08,864] [INFO] [logging.py:68:log_dist] [Rank 0] step=14000, skipped=0, lr=[9.06770213101898e-05, 9.06770213101898e-05, 9.06770213101898e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] 63: iteration 14000/ 24424 | consumed samples: 7168000 | consumed tokens: 14680064000 | elapsed time per iteration (s): 2.28 | learning rate: 9.068E-05 | global batch size: 512 | lm loss: 2.062745E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.553 | TFLOPs: 23.12 | 0: steps: 14000 loss: 2.1291 iter time (s): 2.484 samples/sec: 206.098 63: ------------------------------------------------------------------------------------------- 63: valid loss at iteration 14000 | lm loss value: 2.060910E+00 | lm loss PPL: 7.853117E+00 | 63: ------------------------------------------------------------------------------------------- 0: saving checkpoint at iteration 14000 to checkpoints_3b9 0: [2022-11-26 02:49:09,632] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step14000 is begin to save! 0: [2022-11-26 02:49:09,650] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_01-model_00-model_states.pt... 32: [2022-11-26 02:49:09,650] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_21-model_00-model_states.pt... 32: [2022-11-26 02:49:09,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_21-model_00-model_states.pt. 32: [2022-11-26 02:49:09,901] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_22-model_00-model_states.pt... 0: [2022-11-26 02:49:10,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_01-model_00-model_states.pt. 0: [2022-11-26 02:49:10,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_03-model_00-model_states.pt... 32: [2022-11-26 02:49:10,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_22-model_00-model_states.pt. 32: [2022-11-26 02:49:10,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_23-model_00-model_states.pt... 0: [2022-11-26 02:49:10,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_03-model_00-model_states.pt. 0: [2022-11-26 02:49:10,276] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_04-model_00-model_states.pt... 32: [2022-11-26 02:49:10,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_23-model_00-model_states.pt. 32: [2022-11-26 02:49:10,374] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_24-model_00-model_states.pt... 0: [2022-11-26 02:49:10,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_04-model_00-model_states.pt. 0: [2022-11-26 02:49:10,519] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_05-model_00-model_states.pt... 32: [2022-11-26 02:49:10,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_24-model_00-model_states.pt. 32: [2022-11-26 02:49:10,611] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_25-model_00-model_states.pt... 0: [2022-11-26 02:49:10,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_05-model_00-model_states.pt. 0: [2022-11-26 02:49:10,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_06-model_00-model_states.pt... 32: [2022-11-26 02:49:10,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_25-model_00-model_states.pt. 32: [2022-11-26 02:49:10,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_26-model_00-model_states.pt... 0: [2022-11-26 02:49:11,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_06-model_00-model_states.pt. 0: [2022-11-26 02:49:11,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_07-model_00-model_states.pt... 32: [2022-11-26 02:49:11,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_26-model_00-model_states.pt. 32: [2022-11-26 02:49:11,074] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_27-model_00-model_states.pt... 0: [2022-11-26 02:49:11,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_07-model_00-model_states.pt. 0: [2022-11-26 02:49:11,243] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_08-model_00-model_states.pt... 32: [2022-11-26 02:49:11,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_27-model_00-model_states.pt. 32: [2022-11-26 02:49:11,316] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_28-model_00-model_states.pt... 0: [2022-11-26 02:49:11,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_08-model_00-model_states.pt. 0: [2022-11-26 02:49:11,484] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_09-model_00-model_states.pt... 32: [2022-11-26 02:49:11,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_28-model_00-model_states.pt. 32: [2022-11-26 02:49:11,550] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_29-model_00-model_states.pt... 0: [2022-11-26 02:49:11,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_09-model_00-model_states.pt. 0: [2022-11-26 02:49:11,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_10-model_00-model_states.pt... 32: [2022-11-26 02:49:11,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_29-model_00-model_states.pt. 32: [2022-11-26 02:49:11,783] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_30-model_00-model_states.pt... 0: [2022-11-26 02:49:11,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_10-model_00-model_states.pt. 0: [2022-11-26 02:49:11,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_11-model_00-model_states.pt... 32: [2022-11-26 02:49:12,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_30-model_00-model_states.pt. 32: [2022-11-26 02:49:12,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_31-model_00-model_states.pt... 0: [2022-11-26 02:49:12,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_11-model_00-model_states.pt. 0: [2022-11-26 02:49:12,203] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_12-model_00-model_states.pt... 32: [2022-11-26 02:49:12,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_31-model_00-model_states.pt. 32: [2022-11-26 02:49:12,246] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_32-model_00-model_states.pt... 0: [2022-11-26 02:49:12,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_12-model_00-model_states.pt. 0: [2022-11-26 02:49:12,437] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_13-model_00-model_states.pt... 32: [2022-11-26 02:49:12,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_32-model_00-model_states.pt. 32: [2022-11-26 02:49:12,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_33-model_00-model_states.pt... 0: [2022-11-26 02:49:12,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_13-model_00-model_states.pt. 0: [2022-11-26 02:49:12,675] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_14-model_00-model_states.pt... 32: [2022-11-26 02:49:12,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_33-model_00-model_states.pt. 32: [2022-11-26 02:49:12,698] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_34-model_00-model_states.pt... 0: [2022-11-26 02:49:12,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_14-model_00-model_states.pt. 0: [2022-11-26 02:49:12,914] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_15-model_00-model_states.pt... 32: [2022-11-26 02:49:12,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_34-model_00-model_states.pt. 32: [2022-11-26 02:49:12,927] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_35-model_00-model_states.pt... 0: [2022-11-26 02:49:13,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_15-model_00-model_states.pt. 0: [2022-11-26 02:49:13,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_16-model_00-model_states.pt... 32: [2022-11-26 02:49:13,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_35-model_00-model_states.pt. 32: [2022-11-26 02:49:13,151] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_36-model_00-model_states.pt... 0: [2022-11-26 02:49:13,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_16-model_00-model_states.pt. 0: [2022-11-26 02:49:13,378] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_17-model_00-model_states.pt... 32: [2022-11-26 02:49:13,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_36-model_00-model_states.pt. 32: [2022-11-26 02:49:13,380] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_37-model_00-model_states.pt... 32: [2022-11-26 02:49:13,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_37-model_00-model_states.pt. 32: [2022-11-26 02:49:13,607] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_38-model_00-model_states.pt... 0: [2022-11-26 02:49:13,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_17-model_00-model_states.pt. 0: [2022-11-26 02:49:13,611] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_18-model_00-model_states.pt... 32: [2022-11-26 02:49:13,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_38-model_00-model_states.pt. 32: [2022-11-26 02:49:13,828] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_40-model_00-model_states.pt... 0: [2022-11-26 02:49:13,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_18-model_00-model_states.pt. 0: [2022-11-26 02:49:13,839] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_19-model_00-model_states.pt... 32: [2022-11-26 02:49:13,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_40-model_00-model_states.pt. 32: [2022-11-26 02:49:13,842] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/mp_rank_01_model_states.pt... 32: [2022-11-26 02:49:13,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/mp_rank_01_model_states.pt. 0: [2022-11-26 02:49:14,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_19-model_00-model_states.pt. 0: [2022-11-26 02:49:14,068] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/layer_20-model_00-model_states.pt... 0: [2022-11-26 02:49:14,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/layer_20-model_00-model_states.pt. 0: [2022-11-26 02:49:14,297] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step14000/mp_rank_00_model_states.pt 0: [2022-11-26 02:49:14,297] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/mp_rank_00_model_states.pt... 0: [2022-11-26 02:49:14,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/mp_rank_00_model_states.pt. 60: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 60: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 60: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 60: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 32: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 46: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 46: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 46: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 46: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 36: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 36: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 36: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 48: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 48: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 39: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 39: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 50: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 50: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 51: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 55: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 53: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 53: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 57: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 57: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 61: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 61: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 59: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 59: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 59: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 63: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 63: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 63: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 52: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 54: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 54: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 54: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 54: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 56: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 58: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 58: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 62: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 62: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 60: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 60: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 33: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 35: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 35: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 35: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 34: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 34: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 40: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 40: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 40: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 44: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 44: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 44: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 32: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 32: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 42: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 46: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 46: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 36: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 48: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 48: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 41: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 37: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 37: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 37: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 37: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 47: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 47: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 47: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 49: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 49: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 43: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 43: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 45: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 45: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 45: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 39: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 50: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 50: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 38: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 38: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 51: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 51: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 55: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 55: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 53: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 57: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 57: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 61: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 59: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 63: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 19: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 19: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 21: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 21: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 21: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 15: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 52: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 52: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 54: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 54: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 56: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 58: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 58: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 62: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 16: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 16: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 16: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 2: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 4: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 14: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 14: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 14: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 18: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 10: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 8: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 0: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 26: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 11: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 17: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 9: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 13: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 13: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 13: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 23: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 23: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 23: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 23: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 23: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 25: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 25: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 27: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 29: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 29: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 29: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 31: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 31: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 5: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 5: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 33: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 1: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 1: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 1: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 7: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 7: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 7: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 7: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 7: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 3: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 35: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 22: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 22: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 22: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 22: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 12: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 12: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 12: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 30: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 30: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 30: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 30: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 30: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 24: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 24: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 28: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 28: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 28: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 20: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 20: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 20: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 34: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 34: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 40: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 44: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 6: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 6: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 6: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 32: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 42: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 42: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 41: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 41: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 37: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 37: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 47: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 49: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 43: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 43: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 45: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 39: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 50: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 38: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 51: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 53: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 61: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 59: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 63: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 19: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 21: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 21: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 21: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 15: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 15: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 15: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 15: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 15: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 15: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 52: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 56: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 56: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 56: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 62: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 16: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 16: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 16: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 2: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 2: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 4: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 14: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 14: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 14: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 18: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 18: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 10: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 10: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 10: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 8: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 8: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 8: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 26: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 11: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 11: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 17: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 9: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 13: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 13: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 23: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 23: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 25: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 27: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 29: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 29: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 31: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 31: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 5: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 33: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 1: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 1: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 1: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 1: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 7: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 3: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 3: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 3: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 35: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 22: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 12: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 12: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 30: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 30: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 24: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 24: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 24: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 28: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 28: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 20: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 44: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 6: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 6: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 32: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 32: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 42: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 47: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 39: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 38: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 51: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 61: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 19: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 19: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 21: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 21: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 16: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 2: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 2: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 2: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 4: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 14: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 18: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 10: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 8: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 0: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 0: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 0: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 26: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 11: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 17: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 9: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 13: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 13: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 23: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 25: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 25: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 25: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 27: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 29: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 29: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 31: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 31: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 5: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 33: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 1: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 7: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 3: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 22: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 22: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 12: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 30: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 24: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 28: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 28: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 20: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 6: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 32: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 32: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 39: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 51: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 61: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 19: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 19: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 16: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 2: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 4: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 4: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 4: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 14: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 18: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 10: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 8: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 0: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 26: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 11: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 17: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 9: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 27: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 29: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 31: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 31: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 5: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 33: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 3: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 22: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 12: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 12: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 24: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 20: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 6: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 51: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 19: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 2: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 4: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 18: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 10: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 8: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 0: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 11: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 17: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 9: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 27: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 27: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 27: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 5: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 3: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 24: [2022-11-26 02:49:14,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 20: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 6: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 4: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 18: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 10: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 0: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 26: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 26: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 11: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 17: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 9: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 9: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 27: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 5: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 3: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 17: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 17: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 9: [2022-11-26 02:49:14,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step14000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 32: [2022-11-26 02:49:14,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-26 02:49:14,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-26 02:49:14,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 27: [2022-11-26 02:49:14,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-26 02:49:14,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-26 02:49:14,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 56: [2022-11-26 02:49:14,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 50: [2022-11-26 02:49:14,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 56: [2022-11-26 02:49:14,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 0: [2022-11-26 02:49:14,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 50: [2022-11-26 02:49:14,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 56: [2022-11-26 02:49:14,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 50: [2022-11-26 02:49:14,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 0: [2022-11-26 02:49:14,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-26 02:49:14,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 49: [2022-11-26 02:49:14,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-26 02:49:14,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-26 02:49:14,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 22: [2022-11-26 02:49:14,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-26 02:49:14,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-26 02:49:14,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 57: [2022-11-26 02:49:14,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 57: [2022-11-26 02:49:14,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 42: [2022-11-26 02:49:14,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 57: [2022-11-26 02:49:14,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 42: [2022-11-26 02:49:14,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-26 02:49:14,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 36: [2022-11-26 02:49:14,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 36: [2022-11-26 02:49:14,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-26 02:49:14,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 31: [2022-11-26 02:49:14,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:49:14,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-26 02:49:14,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 37: [2022-11-26 02:49:14,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 37: [2022-11-26 02:49:14,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-26 02:49:14,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 25: [2022-11-26 02:49:14,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 25: [2022-11-26 02:49:14,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-26 02:49:14,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 25: [2022-11-26 02:49:14,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 25: [2022-11-26 02:49:14,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-26 02:49:14,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 5: [2022-11-26 02:49:14,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-26 02:49:14,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-26 02:49:14,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 37: [2022-11-26 02:49:14,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-26 02:49:14,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 59: [2022-11-26 02:49:14,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 37: [2022-11-26 02:49:14,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 59: [2022-11-26 02:49:14,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-26 02:49:14,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 33: [2022-11-26 02:49:14,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 33: [2022-11-26 02:49:14,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 33: [2022-11-26 02:49:14,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 5: [2022-11-26 02:49:14,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-26 02:49:14,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-26 02:49:14,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 47: [2022-11-26 02:49:14,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-26 02:49:14,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-26 02:49:14,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 45: [2022-11-26 02:49:14,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 45: [2022-11-26 02:49:14,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 42: [2022-11-26 02:49:14,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 45: [2022-11-26 02:49:14,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 42: [2022-11-26 02:49:14,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 32: [2022-11-26 02:49:14,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-26 02:49:14,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 42: [2022-11-26 02:49:14,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 22: [2022-11-26 02:49:14,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 32: [2022-11-26 02:49:14,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 61: [2022-11-26 02:49:14,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 35: [2022-11-26 02:49:14,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 22: [2022-11-26 02:49:14,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 61: [2022-11-26 02:49:14,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 35: [2022-11-26 02:49:14,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 22: [2022-11-26 02:49:14,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 61: [2022-11-26 02:49:14,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 35: [2022-11-26 02:49:14,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 27: [2022-11-26 02:49:14,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-26 02:49:14,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-26 02:49:14,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 61: [2022-11-26 02:49:14,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-26 02:49:14,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-26 02:49:14,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 0: [2022-11-26 02:49:14,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-26 02:49:14,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-26 02:49:14,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 34: [2022-11-26 02:49:14,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-26 02:49:14,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-26 02:49:14,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 48: [2022-11-26 02:49:14,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-26 02:49:14,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-26 02:49:14,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 48: [2022-11-26 02:49:14,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-26 02:49:14,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-26 02:49:14,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 41: [2022-11-26 02:49:14,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 47: [2022-11-26 02:49:14,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 50: [2022-11-26 02:49:14,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 34: [2022-11-26 02:49:14,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 41: [2022-11-26 02:49:14,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 45: [2022-11-26 02:49:14,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 41: [2022-11-26 02:49:14,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 45: [2022-11-26 02:49:14,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 41: [2022-11-26 02:49:14,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 45: [2022-11-26 02:49:14,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 41: [2022-11-26 02:49:14,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 34: [2022-11-26 02:49:14,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 41: [2022-11-26 02:49:14,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 34: [2022-11-26 02:49:14,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-26 02:49:14,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-26 02:49:14,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 47: [2022-11-26 02:49:14,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 34: [2022-11-26 02:49:14,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 47: [2022-11-26 02:49:14,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 50: [2022-11-26 02:49:14,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-26 02:49:14,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 31: [2022-11-26 02:49:14,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:49:14,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-26 02:49:14,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 6: [2022-11-26 02:49:14,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-26 02:49:14,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 32: [2022-11-26 02:49:14,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 6: [2022-11-26 02:49:14,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 32: [2022-11-26 02:49:14,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-26 02:49:14,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 7: [2022-11-26 02:49:14,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 6: [2022-11-26 02:49:14,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 7: [2022-11-26 02:49:14,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 6: [2022-11-26 02:49:14,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 7: [2022-11-26 02:49:14,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 6: [2022-11-26 02:49:14,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 62: [2022-11-26 02:49:14,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-26 02:49:14,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-26 02:49:14,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 18: [2022-11-26 02:49:14,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-26 02:49:14,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-26 02:49:14,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 57: [2022-11-26 02:49:14,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 57: [2022-11-26 02:49:14,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 57: [2022-11-26 02:49:14,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 44: [2022-11-26 02:49:14,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-26 02:49:14,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-26 02:49:14,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 29: [2022-11-26 02:49:14,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-26 02:49:14,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-26 02:49:14,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 60: [2022-11-26 02:49:14,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 60: [2022-11-26 02:49:14,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-26 02:49:14,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 60: [2022-11-26 02:49:14,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 60: [2022-11-26 02:49:14,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 60: [2022-11-26 02:49:14,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 25: [2022-11-26 02:49:14,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-26 02:49:14,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-26 02:49:14,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 49: [2022-11-26 02:49:14,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-26 02:49:14,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 49: [2022-11-26 02:49:14,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 19: [2022-11-26 02:49:14,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:49:14,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-26 02:49:14,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 4: [2022-11-26 02:49:14,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 5: [2022-11-26 02:49:14,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 4: [2022-11-26 02:49:14,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 5: [2022-11-26 02:49:14,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 4: [2022-11-26 02:49:14,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 5: [2022-11-26 02:49:14,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 21: [2022-11-26 02:49:14,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-26 02:49:14,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-26 02:49:14,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 37: [2022-11-26 02:49:14,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-26 02:49:14,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-26 02:49:14,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 58: [2022-11-26 02:49:14,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-26 02:49:14,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-26 02:49:14,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 23: [2022-11-26 02:49:14,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 62: [2022-11-26 02:49:14,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-26 02:49:14,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 23: [2022-11-26 02:49:14,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 62: [2022-11-26 02:49:14,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 23: [2022-11-26 02:49:14,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 59: [2022-11-26 02:49:14,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 59: [2022-11-26 02:49:14,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-26 02:49:14,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 43: [2022-11-26 02:49:14,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-26 02:49:14,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-26 02:49:14,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 44: [2022-11-26 02:49:14,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-26 02:49:14,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-26 02:49:14,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 48: [2022-11-26 02:49:14,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-26 02:49:14,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-26 02:49:14,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 48: [2022-11-26 02:49:14,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-26 02:49:14,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-26 02:49:14,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 45: [2022-11-26 02:49:14,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 45: [2022-11-26 02:49:14,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 60: [2022-11-26 02:49:14,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 45: [2022-11-26 02:49:14,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 60: [2022-11-26 02:49:14,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-26 02:49:14,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 34: [2022-11-26 02:49:14,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 45: [2022-11-26 02:49:14,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-26 02:49:14,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-26 02:49:14,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 34: [2022-11-26 02:49:14,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-26 02:49:14,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 7: [2022-11-26 02:49:14,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 7: [2022-11-26 02:49:14,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 41: [2022-11-26 02:49:14,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 7: [2022-11-26 02:49:14,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 41: [2022-11-26 02:49:14,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-26 02:49:14,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 4: [2022-11-26 02:49:14,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 4: [2022-11-26 02:49:14,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-26 02:49:14,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 18: [2022-11-26 02:49:14,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 35: [2022-11-26 02:49:14,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 49: [2022-11-26 02:49:14,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 35: [2022-11-26 02:49:14,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 49: [2022-11-26 02:49:14,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 58: [2022-11-26 02:49:14,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 18: [2022-11-26 02:49:14,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 35: [2022-11-26 02:49:14,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 49: [2022-11-26 02:49:14,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 58: [2022-11-26 02:49:14,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 18: [2022-11-26 02:49:14,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 58: [2022-11-26 02:49:14,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 22: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-26 02:49:14,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 35: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 22: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 35: [2022-11-26 02:49:14,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 59: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 54: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 35: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 59: [2022-11-26 02:49:14,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 29: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 7: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 44: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 11: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 29: [2022-11-26 02:49:14,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 7: [2022-11-26 02:49:14,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 44: [2022-11-26 02:49:14,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 11: [2022-11-26 02:49:14,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-26 02:49:14,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 7: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 44: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 11: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 11: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 29: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 11: [2022-11-26 02:49:14,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 30: [2022-11-26 02:49:14,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 11: [2022-11-26 02:49:14,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 11: [2022-11-26 02:49:14,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 30: [2022-11-26 02:49:14,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 13: [2022-11-26 02:49:14,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 30: [2022-11-26 02:49:14,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 13: [2022-11-26 02:49:14,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-26 02:49:14,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 19: [2022-11-26 02:49:14,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:49:14,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-26 02:49:14,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 17: [2022-11-26 02:49:14,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-26 02:49:14,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-26 02:49:14,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 23: [2022-11-26 02:49:14,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 17: [2022-11-26 02:49:14,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 23: [2022-11-26 02:49:14,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 17: [2022-11-26 02:49:14,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 17: [2022-11-26 02:49:14,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 23: [2022-11-26 02:49:14,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 30: [2022-11-26 02:49:14,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-26 02:49:14,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 30: [2022-11-26 02:49:14,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 27: [2022-11-26 02:49:14,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 27: [2022-11-26 02:49:14,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-26 02:49:14,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 0: [2022-11-26 02:49:14,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-26 02:49:14,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-26 02:49:14,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 36: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-26 02:49:14,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 42: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 36: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 1: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 42: [2022-11-26 02:49:14,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 42: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 1: [2022-11-26 02:49:14,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 1: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 33: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-26 02:49:14,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 5: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 5: [2022-11-26 02:49:14,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 31: [2022-11-26 02:49:14,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 5: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 21: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 21: [2022-11-26 02:49:14,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 16: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 16: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 58: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 16: [2022-11-26 02:49:14,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-26 02:49:14,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-26 02:49:14,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 16: [2022-11-26 02:49:14,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 58: [2022-11-26 02:49:14,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-26 02:49:14,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 7: [2022-11-26 02:49:14,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 54: [2022-11-26 02:49:14,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-26 02:49:14,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 7: [2022-11-26 02:49:14,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-26 02:49:14,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 15: [2022-11-26 02:49:14,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-26 02:49:14,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-26 02:49:14,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 15: [2022-11-26 02:49:14,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 25: [2022-11-26 02:49:14,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 15: [2022-11-26 02:49:14,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 15: [2022-11-26 02:49:14,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 25: [2022-11-26 02:49:14,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-26 02:49:14,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 38: [2022-11-26 02:49:14,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-26 02:49:14,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 54: [2022-11-26 02:49:14,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 38: [2022-11-26 02:49:14,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-26 02:49:14,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-26 02:49:14,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 38: [2022-11-26 02:49:14,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 54: [2022-11-26 02:49:14,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-26 02:49:14,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 35: [2022-11-26 02:49:14,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-26 02:49:14,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-26 02:49:14,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 13: [2022-11-26 02:49:14,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 13: [2022-11-26 02:49:14,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 28: [2022-11-26 02:49:14,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 28: [2022-11-26 02:49:14,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 13: [2022-11-26 02:49:14,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 28: [2022-11-26 02:49:14,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-26 02:49:14,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-26 02:49:14,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-26 02:49:14,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-26 02:49:14,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 28: [2022-11-26 02:49:14,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 28: [2022-11-26 02:49:14,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-26 02:49:14,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-26 02:49:14,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 28: [2022-11-26 02:49:14,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 55: [2022-11-26 02:49:14,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-26 02:49:14,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-26 02:49:14,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 57: [2022-11-26 02:49:14,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 57: [2022-11-26 02:49:14,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 57: [2022-11-26 02:49:14,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 56: [2022-11-26 02:49:14,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-26 02:49:14,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-26 02:49:14,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-26 02:49:14,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-26 02:49:14,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 56: [2022-11-26 02:49:14,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 6: [2022-11-26 02:49:14,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 6: [2022-11-26 02:49:14,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 6: [2022-11-26 02:49:14,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 56: [2022-11-26 02:49:14,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-26 02:49:14,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-26 02:49:14,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 43: [2022-11-26 02:49:14,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-26 02:49:14,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-26 02:49:14,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 1: [2022-11-26 02:49:14,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-26 02:49:14,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-26 02:49:14,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 55: [2022-11-26 02:49:14,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-26 02:49:14,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-26 02:49:14,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 40: [2022-11-26 02:49:14,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-26 02:49:14,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 40: [2022-11-26 02:49:14,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 40: [2022-11-26 02:49:14,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-26 02:49:14,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 40: [2022-11-26 02:49:14,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 14: [2022-11-26 02:49:14,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-26 02:49:14,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 22: [2022-11-26 02:49:14,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-26 02:49:14,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 14: [2022-11-26 02:49:14,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 22: [2022-11-26 02:49:14,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 14: [2022-11-26 02:49:14,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-26 02:49:14,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 14: [2022-11-26 02:49:14,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 6: [2022-11-26 02:49:14,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-26 02:49:14,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 41: [2022-11-26 02:49:14,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 60: [2022-11-26 02:49:14,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 14: [2022-11-26 02:49:14,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 6: [2022-11-26 02:49:14,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 14: [2022-11-26 02:49:14,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 41: [2022-11-26 02:49:14,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 14: [2022-11-26 02:49:14,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 41: [2022-11-26 02:49:14,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 60: [2022-11-26 02:49:14,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-26 02:49:14,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 46: [2022-11-26 02:49:14,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:49:14,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 50: [2022-11-26 02:49:14,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:49:14,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 50: [2022-11-26 02:49:14,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-26 02:49:14,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 26: [2022-11-26 02:49:14,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 26: [2022-11-26 02:49:14,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-26 02:49:14,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-26 02:49:14,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 26: [2022-11-26 02:49:14,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 26: [2022-11-26 02:49:14,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 42: [2022-11-26 02:49:14,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 27: [2022-11-26 02:49:14,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 42: [2022-11-26 02:49:14,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 27: [2022-11-26 02:49:14,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 42: [2022-11-26 02:49:14,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 27: [2022-11-26 02:49:14,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 31: [2022-11-26 02:49:14,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:49:14,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-26 02:49:14,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 46: [2022-11-26 02:49:14,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 23: [2022-11-26 02:49:14,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-26 02:49:14,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 46: [2022-11-26 02:49:14,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 23: [2022-11-26 02:49:14,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 47: [2022-11-26 02:49:14,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:49:14,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 47: [2022-11-26 02:49:14,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-26 02:49:14,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 33: [2022-11-26 02:49:14,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 58: [2022-11-26 02:49:14,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-26 02:49:14,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 33: [2022-11-26 02:49:14,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 58: [2022-11-26 02:49:14,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 33: [2022-11-26 02:49:14,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 36: [2022-11-26 02:49:14,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 39: [2022-11-26 02:49:14,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-26 02:49:14,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-26 02:49:14,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-26 02:49:14,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-26 02:49:14,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 39: [2022-11-26 02:49:14,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 36: [2022-11-26 02:49:14,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-26 02:49:14,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 12: [2022-11-26 02:49:14,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-26 02:49:14,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-26 02:49:14,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 62: [2022-11-26 02:49:14,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-26 02:49:14,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 12: [2022-11-26 02:49:14,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 62: [2022-11-26 02:49:14,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 12: [2022-11-26 02:49:14,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-26 02:49:14,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 37: [2022-11-26 02:49:14,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-26 02:49:14,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-26 02:49:14,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 20: [2022-11-26 02:49:14,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-26 02:49:14,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-26 02:49:14,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-26 02:49:14,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-26 02:49:14,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-26 02:49:14,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-26 02:49:14,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 20: [2022-11-26 02:49:14,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 20: [2022-11-26 02:49:14,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 57: [2022-11-26 02:49:14,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 57: [2022-11-26 02:49:14,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 57: [2022-11-26 02:49:14,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 55: [2022-11-26 02:49:14,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-26 02:49:14,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-26 02:49:14,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 61: [2022-11-26 02:49:14,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 61: [2022-11-26 02:49:14,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-26 02:49:14,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 11: [2022-11-26 02:49:14,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-26 02:49:14,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-26 02:49:14,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 2: [2022-11-26 02:49:14,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-26 02:49:14,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 2: [2022-11-26 02:49:14,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-26 02:49:14,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-26 02:49:14,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-26 02:49:14,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-26 02:49:14,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 2: [2022-11-26 02:49:14,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 2: [2022-11-26 02:49:14,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-26 02:49:14,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-26 02:49:14,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 2: [2022-11-26 02:49:14,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 33: [2022-11-26 02:49:14,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-26 02:49:14,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-26 02:49:14,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 36: [2022-11-26 02:49:14,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-26 02:49:14,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-26 02:49:14,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 52: [2022-11-26 02:49:14,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 52: [2022-11-26 02:49:14,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-26 02:49:14,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-26 02:49:14,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-26 02:49:14,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-26 02:49:14,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-26 02:49:14,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 52: [2022-11-26 02:49:14,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 52: [2022-11-26 02:49:14,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-26 02:49:14,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-26 02:49:14,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 52: [2022-11-26 02:49:14,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 59: [2022-11-26 02:49:14,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 32: [2022-11-26 02:49:14,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 32: [2022-11-26 02:49:14,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 59: [2022-11-26 02:49:14,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 32: [2022-11-26 02:49:14,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 59: [2022-11-26 02:49:14,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 0: [2022-11-26 02:49:14,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 61: [2022-11-26 02:49:14,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 0: [2022-11-26 02:49:14,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 61: [2022-11-26 02:49:14,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 0: [2022-11-26 02:49:14,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 61: [2022-11-26 02:49:14,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 62: [2022-11-26 02:49:14,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-26 02:49:14,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-26 02:49:14,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 51: [2022-11-26 02:49:14,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 51: [2022-11-26 02:49:14,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-26 02:49:14,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-26 02:49:14,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-26 02:49:14,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 49: [2022-11-26 02:49:14,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 51: [2022-11-26 02:49:14,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 49: [2022-11-26 02:49:14,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 51: [2022-11-26 02:49:14,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 49: [2022-11-26 02:49:14,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 51: [2022-11-26 02:49:14,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-26 02:49:14,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 14: [2022-11-26 02:49:14,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-26 02:49:14,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-26 02:49:14,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 63: [2022-11-26 02:49:14,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-26 02:49:14,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-26 02:49:14,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-26 02:49:14,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-26 02:49:14,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 63: [2022-11-26 02:49:14,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 47: [2022-11-26 02:49:14,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 47: [2022-11-26 02:49:14,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 47: [2022-11-26 02:49:14,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 44: [2022-11-26 02:49:14,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-26 02:49:14,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-26 02:49:14,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 8: [2022-11-26 02:49:14,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-26 02:49:14,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-26 02:49:14,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-26 02:49:14,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-26 02:49:14,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-26 02:49:14,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-26 02:49:14,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-26 02:49:14,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-26 02:49:14,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 8: [2022-11-26 02:49:14,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 8: [2022-11-26 02:49:14,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 8: [2022-11-26 02:49:14,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 24: [2022-11-26 02:49:14,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-26 02:49:14,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-26 02:49:14,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 10: [2022-11-26 02:49:14,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 10: [2022-11-26 02:49:14,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-26 02:49:14,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-26 02:49:14,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 10: [2022-11-26 02:49:14,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-26 02:49:14,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 10: [2022-11-26 02:49:14,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-26 02:49:14,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-26 02:49:14,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 10: [2022-11-26 02:49:14,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 10: [2022-11-26 02:49:14,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 10: [2022-11-26 02:49:14,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 53: [2022-11-26 02:49:14,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-26 02:49:14,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-26 02:49:14,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-26 02:49:14,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-26 02:49:14,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-26 02:49:14,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-26 02:49:14,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-26 02:49:14,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-26 02:49:14,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 53: [2022-11-26 02:49:14,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 53: [2022-11-26 02:49:14,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 53: [2022-11-26 02:49:14,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 24: [2022-11-26 02:49:14,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-26 02:49:14,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-26 02:49:14,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 46: [2022-11-26 02:49:14,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:49:14,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-26 02:49:14,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 61: [2022-11-26 02:49:14,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-26 02:49:14,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-26 02:49:14,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 23: [2022-11-26 02:49:14,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-26 02:49:14,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 23: [2022-11-26 02:49:14,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 0: [2022-11-26 02:49:14,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 50: [2022-11-26 02:49:14,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 50: [2022-11-26 02:49:14,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-26 02:49:14,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 0: [2022-11-26 02:49:14,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-26 02:49:14,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 9: [2022-11-26 02:49:14,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 9: [2022-11-26 02:49:14,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-26 02:49:14,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-26 02:49:14,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-26 02:49:14,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-26 02:49:14,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 9: [2022-11-26 02:49:14,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-26 02:49:14,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-26 02:49:14,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-26 02:49:14,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 9: [2022-11-26 02:49:14,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 9: [2022-11-26 02:49:14,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 12: [2022-11-26 02:49:14,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-26 02:49:14,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-26 02:49:14,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 34: [2022-11-26 02:49:14,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 34: [2022-11-26 02:49:14,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-26 02:49:14,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 9: [2022-11-26 02:49:14,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 9: [2022-11-26 02:49:14,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-26 02:49:14,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 20: [2022-11-26 02:49:14,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-26 02:49:14,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-26 02:49:14,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 3: [2022-11-26 02:49:14,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-26 02:49:14,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 3: [2022-11-26 02:49:14,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 3: [2022-11-26 02:49:14,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-26 02:49:14,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-26 02:49:14,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 40: [2022-11-26 02:49:14,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 3: [2022-11-26 02:49:14,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 40: [2022-11-26 02:49:14,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 3: [2022-11-26 02:49:14,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 40: [2022-11-26 02:49:14,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 3: [2022-11-26 02:49:14,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 3: [2022-11-26 02:49:14,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 3: [2022-11-26 02:49:14,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 3: [2022-11-26 02:49:14,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 3: [2022-11-26 02:49:14,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-26 02:49:14,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-26 02:49:14,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 25: [2022-11-26 02:49:14,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-26 02:49:14,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-26 02:49:14,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 28: [2022-11-26 02:49:14,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-26 02:49:14,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-26 02:49:14,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 39: [2022-11-26 02:49:14,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-26 02:49:14,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-26 02:49:14,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 14: [2022-11-26 02:49:14,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-26 02:49:14,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-26 02:49:14,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 63: [2022-11-26 02:49:14,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-26 02:49:14,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-26 02:49:14,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 26: [2022-11-26 02:49:14,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 26: [2022-11-26 02:49:14,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-26 02:49:14,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 18: [2022-11-26 02:49:14,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-26 02:49:14,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-26 02:49:14,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 42: [2022-11-26 02:49:14,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 42: [2022-11-26 02:49:14,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-26 02:49:14,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 54: [2022-11-26 02:49:14,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 54: [2022-11-26 02:49:14,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-26 02:49:14,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 5: [2022-11-26 02:49:14,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-26 02:49:14,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-26 02:49:14,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 48: [2022-11-26 02:49:14,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-26 02:49:14,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-26 02:49:14,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 56: [2022-11-26 02:49:14,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 56: [2022-11-26 02:49:14,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 56: [2022-11-26 02:49:14,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 29: [2022-11-26 02:49:14,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-26 02:49:14,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-26 02:49:14,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 60: [2022-11-26 02:49:14,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 15: [2022-11-26 02:49:14,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 60: [2022-11-26 02:49:14,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-26 02:49:14,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 15: [2022-11-26 02:49:14,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-26 02:49:14,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 4: [2022-11-26 02:49:14,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 4: [2022-11-26 02:49:14,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-26 02:49:14,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 38: [2022-11-26 02:49:14,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-26 02:49:14,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-26 02:49:14,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 2: [2022-11-26 02:49:14,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-26 02:49:14,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-26 02:49:14,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 16: [2022-11-26 02:49:14,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-26 02:49:14,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-26 02:49:14,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 23: [2022-11-26 02:49:14,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-26 02:49:14,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-26 02:49:14,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 1: [2022-11-26 02:49:14,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-26 02:49:14,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-26 02:49:14,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 6: [2022-11-26 02:49:14,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-26 02:49:14,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 41: [2022-11-26 02:49:14,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 6: [2022-11-26 02:49:14,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 41: [2022-11-26 02:49:14,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-26 02:49:14,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 7: [2022-11-26 02:49:14,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-26 02:49:14,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-26 02:49:14,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 8: [2022-11-26 02:49:14,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 8: [2022-11-26 02:49:14,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 36: [2022-11-26 02:49:14,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 8: [2022-11-26 02:49:14,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 36: [2022-11-26 02:49:14,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 13: [2022-11-26 02:49:14,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 36: [2022-11-26 02:49:14,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 13: [2022-11-26 02:49:14,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-26 02:49:14,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 17: [2022-11-26 02:49:14,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 17: [2022-11-26 02:49:14,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-26 02:49:14,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 52: [2022-11-26 02:49:14,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-26 02:49:14,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-26 02:49:14,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 27: [2022-11-26 02:49:14,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-26 02:49:14,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-26 02:49:14,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 21: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-26 02:49:14,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 58: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 33: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 21: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 58: [2022-11-26 02:49:14,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 33: [2022-11-26 02:49:14,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 59: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-26 02:49:14,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 30: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:49:14,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 30: [2022-11-26 02:49:14,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 43: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 30: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 43: [2022-11-26 02:49:14,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 57: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 20: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 24: [2022-11-26 02:49:14,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 57: [2022-11-26 02:49:14,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 20: [2022-11-26 02:49:14,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 24: [2022-11-26 02:49:14,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 20: [2022-11-26 02:49:14,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 57: [2022-11-26 02:49:14,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 24: [2022-11-26 02:49:14,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 22: [2022-11-26 02:49:14,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 22: [2022-11-26 02:49:14,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 22: [2022-11-26 02:49:14,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 53: [2022-11-26 02:49:14,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-26 02:49:14,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-26 02:49:14,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 46: [2022-11-26 02:49:14,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:49:14,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-26 02:49:14,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 34: [2022-11-26 02:49:14,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 34: [2022-11-26 02:49:14,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-26 02:49:14,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 32: [2022-11-26 02:49:14,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-26 02:49:14,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-26 02:49:14,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 55: [2022-11-26 02:49:14,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 55: [2022-11-26 02:49:14,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 44: [2022-11-26 02:49:14,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 55: [2022-11-26 02:49:14,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 11: [2022-11-26 02:49:14,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 44: [2022-11-26 02:49:14,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 11: [2022-11-26 02:49:14,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 44: [2022-11-26 02:49:14,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 11: [2022-11-26 02:49:14,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 51: [2022-11-26 02:49:14,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-26 02:49:14,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-26 02:49:14,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 62: [2022-11-26 02:49:14,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-26 02:49:14,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 62: [2022-11-26 02:49:14,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 47: [2022-11-26 02:49:14,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-26 02:49:14,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-26 02:49:14,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 35: [2022-11-26 02:49:14,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 10: [2022-11-26 02:49:14,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 35: [2022-11-26 02:49:14,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 10: [2022-11-26 02:49:14,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-26 02:49:14,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 35: [2022-11-26 02:49:14,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 45: [2022-11-26 02:49:14,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-26 02:49:14,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-26 02:49:14,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 61: [2022-11-26 02:49:14,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-26 02:49:14,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-26 02:49:14,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 50: [2022-11-26 02:49:14,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-26 02:49:14,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-26 02:49:14,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 25: [2022-11-26 02:49:14,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 25: [2022-11-26 02:49:14,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-26 02:49:14,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 49: [2022-11-26 02:49:14,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-26 02:49:14,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-26 02:49:14,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 40: [2022-11-26 02:49:14,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-26 02:49:14,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-26 02:49:14,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 12: [2022-11-26 02:49:14,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-26 02:49:14,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-26 02:49:14,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 63: [2022-11-26 02:49:14,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-26 02:49:14,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-26 02:49:14,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 9: [2022-11-26 02:49:14,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 9: [2022-11-26 02:49:14,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-26 02:49:14,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 26: [2022-11-26 02:49:14,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 26: [2022-11-26 02:49:14,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-26 02:49:14,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 5: [2022-11-26 02:49:14,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 5: [2022-11-26 02:49:14,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-26 02:49:14,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 54: [2022-11-26 02:49:14,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 54: [2022-11-26 02:49:14,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-26 02:49:14,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 56: [2022-11-26 02:49:14,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-26 02:49:14,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-26 02:49:14,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 42: [2022-11-26 02:49:14,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 15: [2022-11-26 02:49:14,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 42: [2022-11-26 02:49:14,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-26 02:49:14,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 15: [2022-11-26 02:49:14,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-26 02:49:14,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 3: [2022-11-26 02:49:14,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-26 02:49:14,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-26 02:49:14,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 39: [2022-11-26 02:49:14,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-26 02:49:14,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-26 02:49:14,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 28: [2022-11-26 02:49:14,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 28: [2022-11-26 02:49:14,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-26 02:49:14,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 0: [2022-11-26 02:49:14,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 0: [2022-11-26 02:49:14,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 0: [2022-11-26 02:49:14,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 14: [2022-11-26 02:49:14,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 14: [2022-11-26 02:49:14,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-26 02:49:14,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 37: [2022-11-26 02:49:14,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-26 02:49:14,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-26 02:49:14,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 48: [2022-11-26 02:49:14,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-26 02:49:14,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-26 02:49:14,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 2: [2022-11-26 02:49:14,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 2: [2022-11-26 02:49:14,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-26 02:49:14,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 19: [2022-11-26 02:49:14,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:49:14,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-26 02:49:14,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 18: [2022-11-26 02:49:14,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-26 02:49:14,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-26 02:49:14,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 41: [2022-11-26 02:49:14,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-26 02:49:14,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-26 02:49:14,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 29: [2022-11-26 02:49:14,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-26 02:49:14,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-26 02:49:14,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 51: [2022-11-26 02:49:14,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-26 02:49:14,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-26 02:49:14,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 4: [2022-11-26 02:49:14,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-26 02:49:14,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-26 02:49:14,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 36: [2022-11-26 02:49:14,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 36: [2022-11-26 02:49:14,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-26 02:49:14,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 43: [2022-11-26 02:49:14,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-26 02:49:14,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-26 02:49:14,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 11: [2022-11-26 02:49:14,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-26 02:49:14,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 27: [2022-11-26 02:49:14,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 50: [2022-11-26 02:49:14,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 17: [2022-11-26 02:49:14,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 11: [2022-11-26 02:49:14,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 17: [2022-11-26 02:49:14,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 27: [2022-11-26 02:49:14,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 50: [2022-11-26 02:49:14,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 27: [2022-11-26 02:49:14,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 50: [2022-11-26 02:49:14,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 17: [2022-11-26 02:49:14,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 7: [2022-11-26 02:49:14,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-26 02:49:14,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 7: [2022-11-26 02:49:14,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 55: [2022-11-26 02:49:14,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-26 02:49:14,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 44: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-26 02:49:14,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 22: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-26 02:49:14,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 58: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 22: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 58: [2022-11-26 02:49:14,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 23: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 38: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 1: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 38: [2022-11-26 02:49:14,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 1: [2022-11-26 02:49:14,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 38: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 23: [2022-11-26 02:49:14,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 1: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 20: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 23: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 20: [2022-11-26 02:49:14,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 59: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-26 02:49:14,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 31: [2022-11-26 02:49:14,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:49:14,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-26 02:49:14,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 32: [2022-11-26 02:49:14,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-26 02:49:14,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-26 02:49:14,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 30: [2022-11-26 02:49:14,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 33: [2022-11-26 02:49:14,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 6: [2022-11-26 02:49:14,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 33: [2022-11-26 02:49:14,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 30: [2022-11-26 02:49:14,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 33: [2022-11-26 02:49:14,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 6: [2022-11-26 02:49:14,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 6: [2022-11-26 02:49:14,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 30: [2022-11-26 02:49:14,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 57: [2022-11-26 02:49:14,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-26 02:49:14,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-26 02:49:14,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 62: [2022-11-26 02:49:14,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-26 02:49:14,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 10: [2022-11-26 02:49:14,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 62: [2022-11-26 02:49:14,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 10: [2022-11-26 02:49:14,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 8: [2022-11-26 02:49:14,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 52: [2022-11-26 02:49:14,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 10: [2022-11-26 02:49:14,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 8: [2022-11-26 02:49:14,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 52: [2022-11-26 02:49:14,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 8: [2022-11-26 02:49:14,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 52: [2022-11-26 02:49:14,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 47: [2022-11-26 02:49:14,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-26 02:49:14,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-26 02:49:14,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 24: [2022-11-26 02:49:14,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 60: [2022-11-26 02:49:14,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 24: [2022-11-26 02:49:14,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-26 02:49:14,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 60: [2022-11-26 02:49:14,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 53: [2022-11-26 02:49:14,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 60: [2022-11-26 02:49:14,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 53: [2022-11-26 02:49:14,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-26 02:49:14,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 35: [2022-11-26 02:49:14,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 13: [2022-11-26 02:49:14,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-26 02:49:14,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 35: [2022-11-26 02:49:14,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 13: [2022-11-26 02:49:14,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 35: [2022-11-26 02:49:14,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 61: [2022-11-26 02:49:14,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 61: [2022-11-26 02:49:14,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 61: [2022-11-26 02:49:14,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 21: [2022-11-26 02:49:14,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 21: [2022-11-26 02:49:14,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-26 02:49:14,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 34: [2022-11-26 02:49:14,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-26 02:49:14,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-26 02:49:14,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 46: [2022-11-26 02:49:14,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:49:14,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 46: [2022-11-26 02:49:14,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 9: [2022-11-26 02:49:14,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 16: [2022-11-26 02:49:14,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 9: [2022-11-26 02:49:14,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 16: [2022-11-26 02:49:14,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-26 02:49:14,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 9: [2022-11-26 02:49:14,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 40: [2022-11-26 02:49:14,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-26 02:49:14,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-26 02:49:14,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 25: [2022-11-26 02:49:14,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 25: [2022-11-26 02:49:14,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 25: [2022-11-26 02:49:14,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 5: [2022-11-26 02:49:14,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-26 02:49:14,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-26 02:49:14,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 63: [2022-11-26 02:49:14,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 63: [2022-11-26 02:49:14,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-26 02:49:14,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 28: [2022-11-26 02:49:14,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-26 02:49:14,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-26 02:49:14,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 54: [2022-11-26 02:49:14,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 54: [2022-11-26 02:49:14,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-26 02:49:14,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 12: [2022-11-26 02:49:14,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-26 02:49:14,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-26 02:49:14,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 42: [2022-11-26 02:49:14,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 42: [2022-11-26 02:49:14,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-26 02:49:14,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 14: [2022-11-26 02:49:14,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-26 02:49:14,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-26 02:49:14,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 39: [2022-11-26 02:49:14,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 39: [2022-11-26 02:49:14,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-26 02:49:14,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 56: [2022-11-26 02:49:14,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 56: [2022-11-26 02:49:14,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-26 02:49:14,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 15: [2022-11-26 02:49:14,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-26 02:49:14,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-26 02:49:14,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 18: [2022-11-26 02:49:14,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-26 02:49:14,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-26 02:49:14,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 19: [2022-11-26 02:49:14,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:49:14,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-26 02:49:14,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 26: [2022-11-26 02:49:14,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-26 02:49:14,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-26 02:49:14,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 38: [2022-11-26 02:49:14,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 38: [2022-11-26 02:49:14,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-26 02:49:14,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 29: [2022-11-26 02:49:14,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 29: [2022-11-26 02:49:14,793] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-26 02:49:14,793] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 3: [2022-11-26 02:49:14,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-26 02:49:14,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-26 02:49:14,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 48: [2022-11-26 02:49:14,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 48: [2022-11-26 02:49:14,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-26 02:49:14,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 45: [2022-11-26 02:49:14,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-26 02:49:14,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-26 02:49:14,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 45: [2022-11-26 02:49:14,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-26 02:49:14,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 16: [2022-11-26 02:49:14,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 45: [2022-11-26 02:49:14,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 16: [2022-11-26 02:49:14,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 16: [2022-11-26 02:49:14,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 7: [2022-11-26 02:49:14,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-26 02:49:14,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-26 02:49:14,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 19: [2022-11-26 02:49:14,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:49:14,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-26 02:49:14,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 6: [2022-11-26 02:49:14,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-26 02:49:14,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-26 02:49:14,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 2: [2022-11-26 02:49:14,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-26 02:49:14,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-26 02:49:14,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 1: [2022-11-26 02:49:14,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-26 02:49:14,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-26 02:49:14,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 43: [2022-11-26 02:49:14,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-26 02:49:14,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-26 02:49:14,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 8: [2022-11-26 02:49:14,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 17: [2022-11-26 02:49:14,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 8: [2022-11-26 02:49:14,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-26 02:49:14,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 17: [2022-11-26 02:49:14,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 4: [2022-11-26 02:49:14,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 17: [2022-11-26 02:49:14,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 4: [2022-11-26 02:49:14,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-26 02:49:14,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 58: [2022-11-26 02:49:14,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 52: [2022-11-26 02:49:14,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 58: [2022-11-26 02:49:14,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-26 02:49:14,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 52: [2022-11-26 02:49:14,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-26 02:49:14,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 24: [2022-11-26 02:49:14,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-26 02:49:14,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-26 02:49:14,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 37: [2022-11-26 02:49:14,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 37: [2022-11-26 02:49:14,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-26 02:49:14,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 35: [2022-11-26 02:49:14,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-26 02:49:14,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-26 02:49:14,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 30: [2022-11-26 02:49:14,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-26 02:49:14,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-26 02:49:14,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 20: [2022-11-26 02:49:14,809] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-26 02:49:14,809] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-26 02:49:14,809] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 60: [2022-11-26 02:49:14,809] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-26 02:49:14,809] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-26 02:49:14,809] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 27: [2022-11-26 02:49:14,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-26 02:49:14,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-26 02:49:14,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 59: [2022-11-26 02:49:14,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-26 02:49:14,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-26 02:49:14,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 13: [2022-11-26 02:49:14,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-26 02:49:14,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-26 02:49:14,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 23: [2022-11-26 02:49:14,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-26 02:49:14,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-26 02:49:14,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 41: [2022-11-26 02:49:14,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 41: [2022-11-26 02:49:14,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 41: [2022-11-26 02:49:14,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 51: [2022-11-26 02:49:14,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 51: [2022-11-26 02:49:14,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-26 02:49:14,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 0: [2022-11-26 02:49:14,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-26 02:49:14,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-26 02:49:14,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 55: [2022-11-26 02:49:14,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 31: [2022-11-26 02:49:14,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 55: [2022-11-26 02:49:14,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 10: [2022-11-26 02:49:14,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:49:14,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 22: [2022-11-26 02:49:14,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 55: [2022-11-26 02:49:14,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 10: [2022-11-26 02:49:14,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 31: [2022-11-26 02:49:14,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 10: [2022-11-26 02:49:14,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 22: [2022-11-26 02:49:14,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-26 02:49:14,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 21: [2022-11-26 02:49:14,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-26 02:49:14,817] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-26 02:49:14,817] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 36: [2022-11-26 02:49:14,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-26 02:49:14,817] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-26 02:49:14,817] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 44: [2022-11-26 02:49:14,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 44: [2022-11-26 02:49:14,817] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 44: [2022-11-26 02:49:14,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 11: [2022-11-26 02:49:14,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 11: [2022-11-26 02:49:14,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-26 02:49:14,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 50: [2022-11-26 02:49:14,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-26 02:49:14,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-26 02:49:14,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 61: [2022-11-26 02:49:14,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-26 02:49:14,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-26 02:49:14,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 33: [2022-11-26 02:49:14,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-26 02:49:14,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-26 02:49:14,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 32: [2022-11-26 02:49:14,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 32: [2022-11-26 02:49:14,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-26 02:49:14,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 57: [2022-11-26 02:49:14,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 57: [2022-11-26 02:49:14,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-26 02:49:14,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 49: [2022-11-26 02:49:14,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-26 02:49:14,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-26 02:49:14,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 49: [2022-11-26 02:49:14,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 62: [2022-11-26 02:49:14,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-26 02:49:14,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-26 02:49:14,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 49: [2022-11-26 02:49:14,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-26 02:49:14,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 40: [2022-11-26 02:49:14,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-26 02:49:14,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-26 02:49:14,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 34: [2022-11-26 02:49:14,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 34: [2022-11-26 02:49:14,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-26 02:49:14,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 9: [2022-11-26 02:49:14,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 9: [2022-11-26 02:49:14,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 25: [2022-11-26 02:49:14,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 9: [2022-11-26 02:49:14,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 53: [2022-11-26 02:49:14,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-26 02:49:14,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 25: [2022-11-26 02:49:14,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 53: [2022-11-26 02:49:14,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 25: [2022-11-26 02:49:14,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 12: [2022-11-26 02:49:14,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-26 02:49:14,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-26 02:49:14,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 47: [2022-11-26 02:49:14,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-26 02:49:14,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 47: [2022-11-26 02:49:14,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 46: [2022-11-26 02:49:14,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:49:14,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-26 02:49:14,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 45: [2022-11-26 02:49:14,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 45: [2022-11-26 02:49:14,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-26 02:49:14,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 3: [2022-11-26 02:49:14,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 3: [2022-11-26 02:49:14,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-26 02:49:14,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 14: [2022-11-26 02:49:14,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 28: [2022-11-26 02:49:14,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-26 02:49:14,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 14: [2022-11-26 02:49:14,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 28: [2022-11-26 02:49:14,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 14: [2022-11-26 02:49:14,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 5: [2022-11-26 02:49:14,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-26 02:49:14,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-26 02:49:14,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 42: [2022-11-26 02:49:14,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 42: [2022-11-26 02:49:14,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-26 02:49:14,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 39: [2022-11-26 02:49:14,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-26 02:49:14,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-26 02:49:14,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 48: [2022-11-26 02:49:14,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-26 02:49:14,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-26 02:49:14,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 15: [2022-11-26 02:49:14,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-26 02:49:14,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-26 02:49:14,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 54: [2022-11-26 02:49:14,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 18: [2022-11-26 02:49:14,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 54: [2022-11-26 02:49:14,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-26 02:49:14,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 18: [2022-11-26 02:49:14,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-26 02:49:14,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 37: [2022-11-26 02:49:14,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 37: [2022-11-26 02:49:14,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-26 02:49:14,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 26: [2022-11-26 02:49:14,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 26: [2022-11-26 02:49:14,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-26 02:49:14,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 16: [2022-11-26 02:49:14,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-26 02:49:14,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 56: [2022-11-26 02:49:14,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 16: [2022-11-26 02:49:14,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 56: [2022-11-26 02:49:14,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 56: [2022-11-26 02:49:14,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 6: [2022-11-26 02:49:14,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 6: [2022-11-26 02:49:14,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-26 02:49:14,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 38: [2022-11-26 02:49:14,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-26 02:49:14,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-26 02:49:14,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 63: [2022-11-26 02:49:14,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-26 02:49:14,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-26 02:49:14,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 7: [2022-11-26 02:49:14,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 7: [2022-11-26 02:49:14,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-26 02:49:14,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 29: [2022-11-26 02:49:14,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-26 02:49:14,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 29: [2022-11-26 02:49:14,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 2: [2022-11-26 02:49:14,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-26 02:49:14,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-26 02:49:14,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 60: [2022-11-26 02:49:14,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-26 02:49:14,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-26 02:49:14,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 1: [2022-11-26 02:49:14,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-26 02:49:14,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-26 02:49:14,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 24: [2022-11-26 02:49:14,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-26 02:49:14,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-26 02:49:14,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 19: [2022-11-26 02:49:14,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:49:14,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-26 02:49:14,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 41: [2022-11-26 02:49:14,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-26 02:49:14,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-26 02:49:14,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 52: [2022-11-26 02:49:14,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-26 02:49:14,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-26 02:49:14,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 0: [2022-11-26 02:49:14,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-26 02:49:14,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 0: [2022-11-26 02:49:14,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 21: [2022-11-26 02:49:14,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-26 02:49:14,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-26 02:49:14,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 51: [2022-11-26 02:49:14,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-26 02:49:14,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-26 02:49:14,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 27: [2022-11-26 02:49:14,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 27: [2022-11-26 02:49:14,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-26 02:49:14,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 22: [2022-11-26 02:49:14,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-26 02:49:14,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-26 02:49:14,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 35: [2022-11-26 02:49:14,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-26 02:49:14,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-26 02:49:14,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 23: [2022-11-26 02:49:14,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-26 02:49:14,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-26 02:49:14,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 37: [2022-11-26 02:49:14,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-26 02:49:14,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-26 02:49:14,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 13: [2022-11-26 02:49:14,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 36: [2022-11-26 02:49:14,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 13: [2022-11-26 02:49:14,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-26 02:49:14,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 10: [2022-11-26 02:49:14,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 36: [2022-11-26 02:49:14,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-26 02:49:14,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 10: [2022-11-26 02:49:14,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-26 02:49:14,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 59: [2022-11-26 02:49:14,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-26 02:49:14,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-26 02:49:14,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 58: [2022-11-26 02:49:14,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-26 02:49:14,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-26 02:49:14,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 50: [2022-11-26 02:49:14,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-26 02:49:14,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-26 02:49:14,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 31: [2022-11-26 02:49:14,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-26 02:49:14,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-26 02:49:14,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 17: [2022-11-26 02:49:14,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-26 02:49:14,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 11: [2022-11-26 02:49:14,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 17: [2022-11-26 02:49:14,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 11: [2022-11-26 02:49:14,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 32: [2022-11-26 02:49:14,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 11: [2022-11-26 02:49:14,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 32: [2022-11-26 02:49:14,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-26 02:49:14,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 53: [2022-11-26 02:49:14,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-26 02:49:14,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-26 02:49:14,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 20: [2022-11-26 02:49:14,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-26 02:49:14,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 20: [2022-11-26 02:49:14,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 44: [2022-11-26 02:49:14,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 43: [2022-11-26 02:49:14,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-26 02:49:14,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 44: [2022-11-26 02:49:14,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-26 02:49:14,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 43: [2022-11-26 02:49:14,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 47: [2022-11-26 02:49:14,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 57: [2022-11-26 02:49:14,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 47: [2022-11-26 02:49:14,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 57: [2022-11-26 02:49:14,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 47: [2022-11-26 02:49:14,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 57: [2022-11-26 02:49:14,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 49: [2022-11-26 02:49:14,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-26 02:49:14,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 4: [2022-11-26 02:49:14,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 49: [2022-11-26 02:49:14,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 4: [2022-11-26 02:49:14,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-26 02:49:14,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 62: [2022-11-26 02:49:14,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-26 02:49:14,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-26 02:49:14,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 40: [2022-11-26 02:49:14,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-26 02:49:14,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-26 02:49:14,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 30: [2022-11-26 02:49:14,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-26 02:49:14,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-26 02:49:14,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 46: [2022-11-26 02:49:14,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:49:14,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-26 02:49:14,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 12: [2022-11-26 02:49:14,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 12: [2022-11-26 02:49:14,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-26 02:49:14,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 18: [2022-11-26 02:49:14,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-26 02:49:14,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-26 02:49:14,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 15: [2022-11-26 02:49:14,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-26 02:49:14,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-26 02:49:14,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 63: [2022-11-26 02:49:14,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-26 02:49:14,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-26 02:49:14,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 33: [2022-11-26 02:49:14,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-26 02:49:14,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-26 02:49:14,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 54: [2022-11-26 02:49:14,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 54: [2022-11-26 02:49:14,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-26 02:49:14,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 39: [2022-11-26 02:49:14,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-26 02:49:14,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-26 02:49:14,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 16: [2022-11-26 02:49:14,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-26 02:49:14,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-26 02:49:14,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 26: [2022-11-26 02:49:14,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-26 02:49:14,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-26 02:49:14,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 29: [2022-11-26 02:49:14,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-26 02:49:14,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-26 02:49:14,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 19: [2022-11-26 02:49:14,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:49:14,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-26 02:49:14,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 4: [2022-11-26 02:49:14,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-26 02:49:14,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-26 02:49:14,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 18: [2022-11-26 02:49:14,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-26 02:49:14,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-26 02:49:14,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 55: [2022-11-26 02:49:14,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-26 02:49:14,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-26 02:49:14,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 40: [2022-11-26 02:49:14,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-26 02:49:14,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-26 02:49:14,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 16: [2022-11-26 02:49:14,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-26 02:49:14,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 4: [2022-11-26 02:49:14,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 16: [2022-11-26 02:49:14,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 4: [2022-11-26 02:49:14,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-26 02:49:14,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 19: [2022-11-26 02:49:14,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-26 02:49:14,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-26 02:49:14,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 15: [2022-11-26 02:49:14,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 15: [2022-11-26 02:49:14,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-26 02:49:14,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 51: [2022-11-26 02:49:14,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-26 02:49:14,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-26 02:49:14,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 1: [2022-11-26 02:49:14,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 1: [2022-11-26 02:49:14,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-26 02:49:14,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 1: [2022-11-26 02:49:14,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-26 02:49:14,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-26 02:49:14,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 26: [2022-11-26 02:49:14,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-26 02:49:14,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-26 02:49:14,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 54: [2022-11-26 02:49:14,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 24: [2022-11-26 02:49:14,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 24: [2022-11-26 02:49:14,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-26 02:49:14,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 29: [2022-11-26 02:49:14,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-26 02:49:14,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-26 02:49:14,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 12: [2022-11-26 02:49:14,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-26 02:49:14,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-26 02:49:14,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 21: [2022-11-26 02:49:14,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-26 02:49:14,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-26 02:49:14,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 38: [2022-11-26 02:49:14,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-26 02:49:14,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-26 02:49:14,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 46: [2022-11-26 02:49:14,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-26 02:49:14,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-26 02:49:14,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 55: [2022-11-26 02:49:14,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 55: [2022-11-26 02:49:14,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-26 02:49:14,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 30: [2022-11-26 02:49:14,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 30: [2022-11-26 02:49:14,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 30: [2022-11-26 02:49:14,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 54: [2022-11-26 02:49:14,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-26 02:49:14,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 39: [2022-11-26 02:49:14,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-26 02:49:14,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-26 02:49:14,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 8: [2022-11-26 02:49:14,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-26 02:49:14,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 13: [2022-11-26 02:49:14,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 8: [2022-11-26 02:49:14,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 13: [2022-11-26 02:49:14,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-26 02:49:14,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-26 02:49:14,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 13: [2022-11-26 02:49:14,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 43: [2022-11-26 02:49:14,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 13: [2022-11-26 02:49:14,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 43: [2022-11-26 02:49:14,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-26 02:49:14,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 43: [2022-11-26 02:49:14,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 43: [2022-11-26 02:49:14,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 43: [2022-11-26 02:49:14,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 24: [2022-11-26 02:49:14,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-26 02:49:14,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-26 02:49:14,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 21: [2022-11-26 02:49:14,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-26 02:49:14,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-26 02:49:14,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 30: [2022-11-26 02:49:14,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 30: [2022-11-26 02:49:14,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-26 02:49:14,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 17: [2022-11-26 02:49:14,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-26 02:49:14,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-26 02:49:14,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-26 02:49:14,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 17: [2022-11-26 02:49:14,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 38: [2022-11-26 02:49:14,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 17: [2022-11-26 02:49:14,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 38: [2022-11-26 02:49:14,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-26 02:49:14,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 63: [2022-11-26 02:49:14,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-26 02:49:14,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step14000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-26 02:49:14,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! 0: successfully saved checkpoint at iteration 14000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5355.07 63: iteration 14010/ 24424 | consumed samples: 7173120 | consumed tokens: 14690549760 | elapsed time per iteration (s): 2.87 | learning rate: 9.056E-05 | global batch size: 512 | lm loss: 2.060370E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.422 | TFLOPs: 18.37 | 63: iteration 14020/ 24424 | consumed samples: 7178240 | consumed tokens: 14701035520 | elapsed time per iteration (s): 2.80 | learning rate: 9.045E-05 | global batch size: 512 | lm loss: 2.072755E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.006 | TFLOPs: 18.84 | 63: iteration 14030/ 24424 | consumed samples: 7183360 | consumed tokens: 14711521280 | elapsed time per iteration (s): 2.26 | learning rate: 9.033E-05 | global batch size: 512 | lm loss: 2.048983E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.836 | TFLOPs: 23.35 | 63: iteration 14040/ 24424 | consumed samples: 7188480 | consumed tokens: 14722007040 | elapsed time per iteration (s): 2.24 | learning rate: 9.022E-05 | global batch size: 512 | lm loss: 2.077193E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.457 | TFLOPs: 23.52 | 63: iteration 14050/ 24424 | consumed samples: 7193600 | consumed tokens: 14732492800 | elapsed time per iteration (s): 2.30 | learning rate: 9.011E-05 | global batch size: 512 | lm loss: 2.071924E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.088 | TFLOPs: 22.97 | 63: iteration 14060/ 24424 | consumed samples: 7198720 | consumed tokens: 14742978560 | elapsed time per iteration (s): 2.23 | learning rate: 8.999E-05 | global batch size: 512 | lm loss: 2.065168E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.810 | TFLOPs: 23.66 | 63: iteration 14070/ 24424 | consumed samples: 7203840 | consumed tokens: 14753464320 | elapsed time per iteration (s): 2.24 | learning rate: 8.988E-05 | global batch size: 512 | lm loss: 2.084584E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.635 | TFLOPs: 23.54 | 63: iteration 14080/ 24424 | consumed samples: 7208960 | consumed tokens: 14763950080 | elapsed time per iteration (s): 2.25 | learning rate: 8.976E-05 | global batch size: 512 | lm loss: 2.059103E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.225 | TFLOPs: 23.39 | 63: iteration 14090/ 24424 | consumed samples: 7214080 | consumed tokens: 14774435840 | elapsed time per iteration (s): 2.23 | learning rate: 8.965E-05 | global batch size: 512 | lm loss: 2.052582E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.158 | TFLOPs: 23.59 | 63: iteration 14100/ 24424 | consumed samples: 7219200 | consumed tokens: 14784921600 | elapsed time per iteration (s): 2.24 | learning rate: 8.954E-05 | global batch size: 512 | lm loss: 2.052270E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.823 | TFLOPs: 23.56 | 63: iteration 14110/ 24424 | consumed samples: 7224320 | consumed tokens: 14795407360 | elapsed time per iteration (s): 2.23 | learning rate: 8.942E-05 | global batch size: 512 | lm loss: 2.066061E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.805 | TFLOPs: 23.66 | 63: iteration 14120/ 24424 | consumed samples: 7229440 | consumed tokens: 14805893120 | elapsed time per iteration (s): 2.25 | learning rate: 8.931E-05 | global batch size: 512 | lm loss: 2.054496E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.772 | TFLOPs: 23.45 | 63: iteration 14130/ 24424 | consumed samples: 7234560 | consumed tokens: 14816378880 | elapsed time per iteration (s): 2.26 | learning rate: 8.920E-05 | global batch size: 512 | lm loss: 2.061194E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.765 | TFLOPs: 23.34 | 63: iteration 14140/ 24424 | consumed samples: 7239680 | consumed tokens: 14826864640 | elapsed time per iteration (s): 2.25 | learning rate: 8.908E-05 | global batch size: 512 | lm loss: 2.058708E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.684 | TFLOPs: 23.44 | 63: iteration 14150/ 24424 | consumed samples: 7244800 | consumed tokens: 14837350400 | elapsed time per iteration (s): 2.24 | learning rate: 8.897E-05 | global batch size: 512 | lm loss: 2.068366E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.573 | TFLOPs: 23.53 | 63: iteration 14160/ 24424 | consumed samples: 7249920 | consumed tokens: 14847836160 | elapsed time per iteration (s): 2.23 | learning rate: 8.885E-05 | global batch size: 512 | lm loss: 2.102266E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.510 | TFLOPs: 23.63 | 63: iteration 14170/ 24424 | consumed samples: 7255040 | consumed tokens: 14858321920 | elapsed time per iteration (s): 2.27 | learning rate: 8.874E-05 | global batch size: 512 | lm loss: 2.078338E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.361 | TFLOPs: 23.20 | 63: iteration 14180/ 24424 | consumed samples: 7260160 | consumed tokens: 14868807680 | elapsed time per iteration (s): 2.25 | learning rate: 8.863E-05 | global batch size: 512 | lm loss: 2.063870E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.917 | TFLOPs: 23.46 | 63: iteration 14190/ 24424 | consumed samples: 7265280 | consumed tokens: 14879293440 | elapsed time per iteration (s): 2.23 | learning rate: 8.851E-05 | global batch size: 512 | lm loss: 2.063621E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.185 | TFLOPs: 23.59 | 63: iteration 14200/ 24424 | consumed samples: 7270400 | consumed tokens: 14889779200 | elapsed time per iteration (s): 4.80 | learning rate: 8.840E-05 | global batch size: 512 | lm loss: 2.071015E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 106.594 | TFLOPs: 10.97 | 63: iteration 14210/ 24424 | consumed samples: 7275520 | consumed tokens: 14900264960 | elapsed time per iteration (s): 2.23 | learning rate: 8.829E-05 | global batch size: 512 | lm loss: 2.076141E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.087 | TFLOPs: 23.58 | 63: iteration 14220/ 24424 | consumed samples: 7280640 | consumed tokens: 14910750720 | elapsed time per iteration (s): 2.25 | learning rate: 8.817E-05 | global batch size: 512 | lm loss: 2.047132E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.621 | TFLOPs: 23.43 | 63: iteration 14230/ 24424 | consumed samples: 7285760 | consumed tokens: 14921236480 | elapsed time per iteration (s): 2.25 | learning rate: 8.806E-05 | global batch size: 512 | lm loss: 2.050417E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.053 | TFLOPs: 23.37 | 63: iteration 14240/ 24424 | consumed samples: 7290880 | consumed tokens: 14931722240 | elapsed time per iteration (s): 2.24 | learning rate: 8.795E-05 | global batch size: 512 | lm loss: 2.047859E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.299 | TFLOPs: 23.50 | 63: iteration 14250/ 24424 | consumed samples: 7296000 | consumed tokens: 14942208000 | elapsed time per iteration (s): 2.25 | learning rate: 8.783E-05 | global batch size: 512 | lm loss: 2.071111E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.913 | TFLOPs: 23.46 | 63: iteration 14260/ 24424 | consumed samples: 7301120 | consumed tokens: 14952693760 | elapsed time per iteration (s): 2.23 | learning rate: 8.772E-05 | global batch size: 512 | lm loss: 2.075668E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.758 | TFLOPs: 23.65 | 63: iteration 14270/ 24424 | consumed samples: 7306240 | consumed tokens: 14963179520 | elapsed time per iteration (s): 2.27 | learning rate: 8.761E-05 | global batch size: 512 | lm loss: 2.059707E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.177 | TFLOPs: 23.18 | 63: iteration 14280/ 24424 | consumed samples: 7311360 | consumed tokens: 14973665280 | elapsed time per iteration (s): 2.25 | learning rate: 8.749E-05 | global batch size: 512 | lm loss: 2.070193E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.853 | TFLOPs: 23.46 | 63: iteration 14290/ 24424 | consumed samples: 7316480 | consumed tokens: 14984151040 | elapsed time per iteration (s): 2.23 | learning rate: 8.738E-05 | global batch size: 512 | lm loss: 2.055729E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.612 | TFLOPs: 23.64 | 63: iteration 14300/ 24424 | consumed samples: 7321600 | consumed tokens: 14994636800 | elapsed time per iteration (s): 2.23 | learning rate: 8.727E-05 | global batch size: 512 | lm loss: 2.067547E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.561 | TFLOPs: 23.63 | 63: iteration 14310/ 24424 | consumed samples: 7326720 | consumed tokens: 15005122560 | elapsed time per iteration (s): 2.23 | learning rate: 8.715E-05 | global batch size: 512 | lm loss: 2.062017E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.746 | TFLOPs: 23.65 | 63: iteration 14320/ 24424 | consumed samples: 7331840 | consumed tokens: 15015608320 | elapsed time per iteration (s): 2.26 | learning rate: 8.704E-05 | global batch size: 512 | lm loss: 2.069103E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.522 | TFLOPs: 23.32 | 63: iteration 14330/ 24424 | consumed samples: 7336960 | consumed tokens: 15026094080 | elapsed time per iteration (s): 2.92 | learning rate: 8.693E-05 | global batch size: 512 | lm loss: 2.063266E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.391 | TFLOPs: 18.06 | 63: iteration 14340/ 24424 | consumed samples: 7342080 | consumed tokens: 15036579840 | elapsed time per iteration (s): 2.23 | learning rate: 8.681E-05 | global batch size: 512 | lm loss: 2.066525E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.310 | TFLOPs: 23.61 | 63: iteration 14350/ 24424 | consumed samples: 7347200 | consumed tokens: 15047065600 | elapsed time per iteration (s): 2.25 | learning rate: 8.670E-05 | global batch size: 512 | lm loss: 2.063700E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.998 | TFLOPs: 23.47 | 63: iteration 14360/ 24424 | consumed samples: 7352320 | consumed tokens: 15057551360 | elapsed time per iteration (s): 2.24 | learning rate: 8.659E-05 | global batch size: 512 | lm loss: 2.061926E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.867 | TFLOPs: 23.56 | 63: iteration 14370/ 24424 | consumed samples: 7357440 | consumed tokens: 15068037120 | elapsed time per iteration (s): 2.26 | learning rate: 8.648E-05 | global batch size: 512 | lm loss: 2.086148E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.037 | TFLOPs: 23.37 | 63: iteration 14380/ 24424 | consumed samples: 7362560 | consumed tokens: 15078522880 | elapsed time per iteration (s): 2.23 | learning rate: 8.636E-05 | global batch size: 512 | lm loss: 2.077051E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.468 | TFLOPs: 23.62 | 63: iteration 14390/ 24424 | consumed samples: 7367680 | consumed tokens: 15089008640 | elapsed time per iteration (s): 2.25 | learning rate: 8.625E-05 | global batch size: 512 | lm loss: 2.067811E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.067 | TFLOPs: 23.38 | 63: iteration 14400/ 24424 | consumed samples: 7372800 | consumed tokens: 15099494400 | elapsed time per iteration (s): 2.25 | learning rate: 8.614E-05 | global batch size: 512 | lm loss: 2.063795E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.972 | TFLOPs: 23.47 | 63: iteration 14410/ 24424 | consumed samples: 7377920 | consumed tokens: 15109980160 | elapsed time per iteration (s): 2.25 | learning rate: 8.602E-05 | global batch size: 512 | lm loss: 2.047990E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.877 | TFLOPs: 23.46 | 63: iteration 14420/ 24424 | consumed samples: 7383040 | consumed tokens: 15120465920 | elapsed time per iteration (s): 2.25 | learning rate: 8.591E-05 | global batch size: 512 | lm loss: 2.057130E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.062 | TFLOPs: 23.37 | 63: iteration 14430/ 24424 | consumed samples: 7388160 | consumed tokens: 15130951680 | elapsed time per iteration (s): 2.23 | learning rate: 8.580E-05 | global batch size: 512 | lm loss: 2.065712E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.954 | TFLOPs: 23.67 | 63: iteration 14440/ 24424 | consumed samples: 7393280 | consumed tokens: 15141437440 | elapsed time per iteration (s): 2.24 | learning rate: 8.569E-05 | global batch size: 512 | lm loss: 2.063953E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.415 | TFLOPs: 23.51 | 63: iteration 14450/ 24424 | consumed samples: 7398400 | consumed tokens: 15151923200 | elapsed time per iteration (s): 2.23 | learning rate: 8.557E-05 | global batch size: 512 | lm loss: 2.067903E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.847 | TFLOPs: 23.66 | 63: iteration 14460/ 24424 | consumed samples: 7403520 | consumed tokens: 15162408960 | elapsed time per iteration (s): 2.23 | learning rate: 8.546E-05 | global batch size: 512 | lm loss: 2.074840E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.598 | TFLOPs: 23.64 | 63: iteration 14470/ 24424 | consumed samples: 7408640 | consumed tokens: 15172894720 | elapsed time per iteration (s): 2.23 | learning rate: 8.535E-05 | global batch size: 512 | lm loss: 2.071580E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.785 | TFLOPs: 23.66 | 63: iteration 14480/ 24424 | consumed samples: 7413760 | consumed tokens: 15183380480 | elapsed time per iteration (s): 2.23 | learning rate: 8.524E-05 | global batch size: 512 | lm loss: 2.058659E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.677 | TFLOPs: 23.64 | 63: iteration 14490/ 24424 | consumed samples: 7418880 | consumed tokens: 15193866240 | elapsed time per iteration (s): 2.35 | learning rate: 8.512E-05 | global batch size: 512 | lm loss: 2.093662E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 218.114 | TFLOPs: 22.45 | 63: iteration 14500/ 24424 | consumed samples: 7424000 | consumed tokens: 15204352000 | elapsed time per iteration (s): 2.25 | learning rate: 8.501E-05 | global batch size: 512 | lm loss: 2.058505E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.822 | TFLOPs: 23.45 | 63: iteration 14510/ 24424 | consumed samples: 7429120 | consumed tokens: 15214837760 | elapsed time per iteration (s): 3.47 | learning rate: 8.490E-05 | global batch size: 512 | lm loss: 2.046647E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 147.648 | TFLOPs: 15.20 | 63: iteration 14520/ 24424 | consumed samples: 7434240 | consumed tokens: 15225323520 | elapsed time per iteration (s): 2.26 | learning rate: 8.479E-05 | global batch size: 512 | lm loss: 2.051604E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.642 | TFLOPs: 23.33 | 63: iteration 14530/ 24424 | consumed samples: 7439360 | consumed tokens: 15235809280 | elapsed time per iteration (s): 2.23 | learning rate: 8.467E-05 | global batch size: 512 | lm loss: 2.070587E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.371 | TFLOPs: 23.61 | 63: iteration 14540/ 24424 | consumed samples: 7444480 | consumed tokens: 15246295040 | elapsed time per iteration (s): 2.23 | learning rate: 8.456E-05 | global batch size: 512 | lm loss: 2.063720E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.059 | TFLOPs: 23.68 | 63: iteration 14550/ 24424 | consumed samples: 7449600 | consumed tokens: 15256780800 | elapsed time per iteration (s): 2.26 | learning rate: 8.445E-05 | global batch size: 512 | lm loss: 2.051170E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.256 | TFLOPs: 23.29 | 63: iteration 14560/ 24424 | consumed samples: 7454720 | consumed tokens: 15267266560 | elapsed time per iteration (s): 2.25 | learning rate: 8.434E-05 | global batch size: 512 | lm loss: 2.052859E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.815 | TFLOPs: 23.45 | 63: iteration 14570/ 24424 | consumed samples: 7459840 | consumed tokens: 15277752320 | elapsed time per iteration (s): 2.26 | learning rate: 8.423E-05 | global batch size: 512 | lm loss: 2.054486E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.616 | TFLOPs: 23.33 | 63: iteration 14580/ 24424 | consumed samples: 7464960 | consumed tokens: 15288238080 | elapsed time per iteration (s): 2.26 | learning rate: 8.411E-05 | global batch size: 512 | lm loss: 2.063634E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.166 | TFLOPs: 23.28 | 63: iteration 14590/ 24424 | consumed samples: 7470080 | consumed tokens: 15298723840 | elapsed time per iteration (s): 2.28 | learning rate: 8.400E-05 | global batch size: 512 | lm loss: 2.058920E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.421 | TFLOPs: 23.10 | 63: iteration 14600/ 24424 | consumed samples: 7475200 | consumed tokens: 15309209600 | elapsed time per iteration (s): 2.24 | learning rate: 8.389E-05 | global batch size: 512 | lm loss: 2.038128E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.622 | TFLOPs: 23.54 | 63: iteration 14610/ 24424 | consumed samples: 7480320 | consumed tokens: 15319695360 | elapsed time per iteration (s): 2.24 | learning rate: 8.378E-05 | global batch size: 512 | lm loss: 2.071233E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.266 | TFLOPs: 23.50 | 63: iteration 14620/ 24424 | consumed samples: 7485440 | consumed tokens: 15330181120 | elapsed time per iteration (s): 2.24 | learning rate: 8.367E-05 | global batch size: 512 | lm loss: 2.044726E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.876 | TFLOPs: 23.56 | 63: iteration 14630/ 24424 | consumed samples: 7490560 | consumed tokens: 15340666880 | elapsed time per iteration (s): 2.24 | learning rate: 8.355E-05 | global batch size: 512 | lm loss: 2.042936E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.912 | TFLOPs: 23.57 | 63: iteration 14640/ 24424 | consumed samples: 7495680 | consumed tokens: 15351152640 | elapsed time per iteration (s): 2.65 | learning rate: 8.344E-05 | global batch size: 512 | lm loss: 2.065201E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 192.969 | TFLOPs: 19.87 | 63: iteration 14650/ 24424 | consumed samples: 7500800 | consumed tokens: 15361638400 | elapsed time per iteration (s): 2.28 | learning rate: 8.333E-05 | global batch size: 512 | lm loss: 2.058727E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.141 | TFLOPs: 23.07 | 63: iteration 14660/ 24424 | consumed samples: 7505920 | consumed tokens: 15372124160 | elapsed time per iteration (s): 2.23 | learning rate: 8.322E-05 | global batch size: 512 | lm loss: 2.057859E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.310 | TFLOPs: 23.61 | 63: iteration 14670/ 24424 | consumed samples: 7511040 | consumed tokens: 15382609920 | elapsed time per iteration (s): 2.31 | learning rate: 8.311E-05 | global batch size: 512 | lm loss: 2.044159E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.376 | TFLOPs: 22.79 | 63: iteration 14680/ 24424 | consumed samples: 7516160 | consumed tokens: 15393095680 | elapsed time per iteration (s): 2.27 | learning rate: 8.300E-05 | global batch size: 512 | lm loss: 2.055628E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.917 | TFLOPs: 23.26 | 63: iteration 14690/ 24424 | consumed samples: 7521280 | consumed tokens: 15403581440 | elapsed time per iteration (s): 2.25 | learning rate: 8.289E-05 | global batch size: 512 | lm loss: 2.061184E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.969 | TFLOPs: 23.47 | 63: iteration 14700/ 24424 | consumed samples: 7526400 | consumed tokens: 15414067200 | elapsed time per iteration (s): 2.33 | learning rate: 8.277E-05 | global batch size: 512 | lm loss: 2.067650E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 219.868 | TFLOPs: 22.63 | 63: iteration 14710/ 24424 | consumed samples: 7531520 | consumed tokens: 15424552960 | elapsed time per iteration (s): 2.23 | learning rate: 8.266E-05 | global batch size: 512 | lm loss: 2.017698E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.374 | TFLOPs: 23.61 | 63: iteration 14720/ 24424 | consumed samples: 7536640 | consumed tokens: 15435038720 | elapsed time per iteration (s): 2.27 | learning rate: 8.255E-05 | global batch size: 512 | lm loss: 2.068635E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.012 | TFLOPs: 23.27 | 63: iteration 14730/ 24424 | consumed samples: 7541760 | consumed tokens: 15445524480 | elapsed time per iteration (s): 2.24 | learning rate: 8.244E-05 | global batch size: 512 | lm loss: 2.066073E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.874 | TFLOPs: 23.56 | 63: iteration 14740/ 24424 | consumed samples: 7546880 | consumed tokens: 15456010240 | elapsed time per iteration (s): 2.25 | learning rate: 8.233E-05 | global batch size: 512 | lm loss: 2.070351E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.794 | TFLOPs: 23.45 | 63: iteration 14750/ 24424 | consumed samples: 7552000 | consumed tokens: 15466496000 | elapsed time per iteration (s): 2.26 | learning rate: 8.222E-05 | global batch size: 512 | lm loss: 2.061403E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.709 | TFLOPs: 23.34 | 63: iteration 14760/ 24424 | consumed samples: 7557120 | consumed tokens: 15476981760 | elapsed time per iteration (s): 2.23 | learning rate: 8.211E-05 | global batch size: 512 | lm loss: 2.070139E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.688 | TFLOPs: 23.65 | 63: iteration 14770/ 24424 | consumed samples: 7562240 | consumed tokens: 15487467520 | elapsed time per iteration (s): 2.24 | learning rate: 8.199E-05 | global batch size: 512 | lm loss: 2.055959E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.342 | TFLOPs: 23.51 | 63: iteration 14780/ 24424 | consumed samples: 7567360 | consumed tokens: 15497953280 | elapsed time per iteration (s): 2.24 | learning rate: 8.188E-05 | global batch size: 512 | lm loss: 2.041867E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.275 | TFLOPs: 23.50 | 63: iteration 14790/ 24424 | consumed samples: 7572480 | consumed tokens: 15508439040 | elapsed time per iteration (s): 2.25 | learning rate: 8.177E-05 | global batch size: 512 | lm loss: 2.056766E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.790 | TFLOPs: 23.45 | 63: iteration 14800/ 24424 | consumed samples: 7577600 | consumed tokens: 15518924800 | elapsed time per iteration (s): 2.26 | learning rate: 8.166E-05 | global batch size: 512 | lm loss: 2.061302E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.102 | TFLOPs: 23.28 | 63: iteration 14810/ 24424 | consumed samples: 7582720 | consumed tokens: 15529410560 | elapsed time per iteration (s): 2.25 | learning rate: 8.155E-05 | global batch size: 512 | lm loss: 2.049279E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.195 | TFLOPs: 23.39 | 63: iteration 14820/ 24424 | consumed samples: 7587840 | consumed tokens: 15539896320 | elapsed time per iteration (s): 4.06 | learning rate: 8.144E-05 | global batch size: 512 | lm loss: 2.045788E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 126.100 | TFLOPs: 12.98 | 63: iteration 14830/ 24424 | consumed samples: 7592960 | consumed tokens: 15550382080 | elapsed time per iteration (s): 2.23 | learning rate: 8.133E-05 | global batch size: 512 | lm loss: 2.047765E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.708 | TFLOPs: 23.65 | 63: iteration 14840/ 24424 | consumed samples: 7598080 | consumed tokens: 15560867840 | elapsed time per iteration (s): 2.26 | learning rate: 8.122E-05 | global batch size: 512 | lm loss: 2.063287E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.097 | TFLOPs: 23.28 | 63: iteration 14850/ 24424 | consumed samples: 7603200 | consumed tokens: 15571353600 | elapsed time per iteration (s): 2.23 | learning rate: 8.111E-05 | global batch size: 512 | lm loss: 2.035566E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.218 | TFLOPs: 23.60 | 63: iteration 14860/ 24424 | consumed samples: 7608320 | consumed tokens: 15581839360 | elapsed time per iteration (s): 2.23 | learning rate: 8.100E-05 | global batch size: 512 | lm loss: 2.050245E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.103 | TFLOPs: 23.69 | 63: iteration 14870/ 24424 | consumed samples: 7613440 | consumed tokens: 15592325120 | elapsed time per iteration (s): 2.24 | learning rate: 8.089E-05 | global batch size: 512 | lm loss: 2.065652E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.261 | TFLOPs: 23.50 | 63: iteration 14880/ 24424 | consumed samples: 7618560 | consumed tokens: 15602810880 | elapsed time per iteration (s): 2.22 | learning rate: 8.078E-05 | global batch size: 512 | lm loss: 2.044108E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.120 | TFLOPs: 23.69 | 63: iteration 14890/ 24424 | consumed samples: 7623680 | consumed tokens: 15613296640 | elapsed time per iteration (s): 2.26 | learning rate: 8.066E-05 | global batch size: 512 | lm loss: 2.032436E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.826 | TFLOPs: 23.35 | 63: iteration 14900/ 24424 | consumed samples: 7628800 | consumed tokens: 15623782400 | elapsed time per iteration (s): 2.23 | learning rate: 8.055E-05 | global batch size: 512 | lm loss: 2.068179E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.413 | TFLOPs: 23.62 | 63: iteration 14910/ 24424 | consumed samples: 7633920 | consumed tokens: 15634268160 | elapsed time per iteration (s): 2.25 | learning rate: 8.044E-05 | global batch size: 512 | lm loss: 2.039311E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.339 | TFLOPs: 23.40 | 63: iteration 14920/ 24424 | consumed samples: 7639040 | consumed tokens: 15644753920 | elapsed time per iteration (s): 2.24 | learning rate: 8.033E-05 | global batch size: 512 | lm loss: 2.047825E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.550 | TFLOPs: 23.53 | 63: iteration 14930/ 24424 | consumed samples: 7644160 | consumed tokens: 15655239680 | elapsed time per iteration (s): 2.24 | learning rate: 8.022E-05 | global batch size: 512 | lm loss: 2.032841E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.375 | TFLOPs: 23.51 | 63: iteration 14940/ 24424 | consumed samples: 7649280 | consumed tokens: 15665725440 | elapsed time per iteration (s): 2.25 | learning rate: 8.011E-05 | global batch size: 512 | lm loss: 2.041488E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.618 | TFLOPs: 23.43 | 63: iteration 14950/ 24424 | consumed samples: 7654400 | consumed tokens: 15676211200 | elapsed time per iteration (s): 2.64 | learning rate: 8.000E-05 | global batch size: 512 | lm loss: 2.060214E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 193.935 | TFLOPs: 19.96 | 63: iteration 14960/ 24424 | consumed samples: 7659520 | consumed tokens: 15686696960 | elapsed time per iteration (s): 2.26 | learning rate: 7.989E-05 | global batch size: 512 | lm loss: 2.050229E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.017 | TFLOPs: 23.37 | 63: iteration 14970/ 24424 | consumed samples: 7664640 | consumed tokens: 15697182720 | elapsed time per iteration (s): 2.24 | learning rate: 7.978E-05 | global batch size: 512 | lm loss: 2.034474E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.007 | TFLOPs: 23.58 | 63: iteration 14980/ 24424 | consumed samples: 7669760 | consumed tokens: 15707668480 | elapsed time per iteration (s): 2.23 | learning rate: 7.967E-05 | global batch size: 512 | lm loss: 2.054015E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.544 | TFLOPs: 23.63 | 63: iteration 14990/ 24424 | consumed samples: 7674880 | consumed tokens: 15718154240 | elapsed time per iteration (s): 2.24 | learning rate: 7.956E-05 | global batch size: 512 | lm loss: 2.039359E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.099 | TFLOPs: 23.48 | 63: iteration 15000/ 24424 | consumed samples: 7680000 | consumed tokens: 15728640000 | elapsed time per iteration (s): 2.25 | learning rate: 7.945E-05 | global batch size: 512 | lm loss: 2.061407E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.459 | TFLOPs: 23.42 | 63: ------------------------------------------------------------------------------------------- 63: valid loss at iteration 15000 | lm loss value: 2.108397E+00 | lm loss PPL: 8.235030E+00 | 63: ------------------------------------------------------------------------------------------- 0: saving checkpoint at iteration 15000 to checkpoints_3b9 0: [2022-11-26 03:27:59,693] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step15000 is begin to save! 0: [2022-11-26 03:27:59,715] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_01-model_00-model_states.pt... 32: [2022-11-26 03:27:59,715] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_21-model_00-model_states.pt... 32: [2022-11-26 03:28:00,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_21-model_00-model_states.pt. 32: [2022-11-26 03:28:00,002] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_22-model_00-model_states.pt... 0: [2022-11-26 03:28:00,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_01-model_00-model_states.pt. 0: [2022-11-26 03:28:00,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_03-model_00-model_states.pt... 32: [2022-11-26 03:28:00,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_22-model_00-model_states.pt. 32: [2022-11-26 03:28:00,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_23-model_00-model_states.pt... 0: [2022-11-26 03:28:00,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_03-model_00-model_states.pt. 0: [2022-11-26 03:28:00,293] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_04-model_00-model_states.pt... 32: [2022-11-26 03:28:00,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_23-model_00-model_states.pt. 32: [2022-11-26 03:28:00,465] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_24-model_00-model_states.pt... 0: [2022-11-26 03:28:00,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_04-model_00-model_states.pt. 0: [2022-11-26 03:28:00,528] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_05-model_00-model_states.pt... 32: [2022-11-26 03:28:00,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_24-model_00-model_states.pt. 32: [2022-11-26 03:28:00,698] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_25-model_00-model_states.pt... 0: [2022-11-26 03:28:00,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_05-model_00-model_states.pt. 0: [2022-11-26 03:28:00,767] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_06-model_00-model_states.pt... 32: [2022-11-26 03:28:00,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_25-model_00-model_states.pt. 32: [2022-11-26 03:28:00,931] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_26-model_00-model_states.pt... 0: [2022-11-26 03:28:00,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_06-model_00-model_states.pt. 0: [2022-11-26 03:28:00,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_07-model_00-model_states.pt... 32: [2022-11-26 03:28:01,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_26-model_00-model_states.pt. 32: [2022-11-26 03:28:01,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_27-model_00-model_states.pt... 0: [2022-11-26 03:28:01,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_07-model_00-model_states.pt. 0: [2022-11-26 03:28:01,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_08-model_00-model_states.pt... 32: [2022-11-26 03:28:01,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_27-model_00-model_states.pt. 32: [2022-11-26 03:28:01,390] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_28-model_00-model_states.pt... 0: [2022-11-26 03:28:01,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_08-model_00-model_states.pt. 0: [2022-11-26 03:28:01,421] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_09-model_00-model_states.pt... 32: [2022-11-26 03:28:01,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_28-model_00-model_states.pt. 32: [2022-11-26 03:28:01,622] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_29-model_00-model_states.pt... 0: [2022-11-26 03:28:01,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_09-model_00-model_states.pt. 0: [2022-11-26 03:28:01,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_10-model_00-model_states.pt... 0: [2022-11-26 03:28:01,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_10-model_00-model_states.pt. 0: [2022-11-26 03:28:01,853] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_11-model_00-model_states.pt... 32: [2022-11-26 03:28:01,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_29-model_00-model_states.pt. 32: [2022-11-26 03:28:01,855] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_30-model_00-model_states.pt... 0: [2022-11-26 03:28:02,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_11-model_00-model_states.pt. 0: [2022-11-26 03:28:02,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_12-model_00-model_states.pt... 32: [2022-11-26 03:28:02,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_30-model_00-model_states.pt. 32: [2022-11-26 03:28:02,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_31-model_00-model_states.pt... 0: [2022-11-26 03:28:02,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_12-model_00-model_states.pt. 0: [2022-11-26 03:28:02,288] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_13-model_00-model_states.pt... 32: [2022-11-26 03:28:02,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_31-model_00-model_states.pt. 32: [2022-11-26 03:28:02,309] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_32-model_00-model_states.pt... 0: [2022-11-26 03:28:02,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_13-model_00-model_states.pt. 0: [2022-11-26 03:28:02,506] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_14-model_00-model_states.pt... 32: [2022-11-26 03:28:02,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_32-model_00-model_states.pt. 32: [2022-11-26 03:28:02,530] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_33-model_00-model_states.pt... 0: [2022-11-26 03:28:02,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_14-model_00-model_states.pt. 0: [2022-11-26 03:28:02,723] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_15-model_00-model_states.pt... 32: [2022-11-26 03:28:02,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_33-model_00-model_states.pt. 32: [2022-11-26 03:28:02,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_34-model_00-model_states.pt... 0: [2022-11-26 03:28:02,939] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_15-model_00-model_states.pt. 0: [2022-11-26 03:28:02,940] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_16-model_00-model_states.pt... 32: [2022-11-26 03:28:02,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_34-model_00-model_states.pt. 32: [2022-11-26 03:28:02,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_35-model_00-model_states.pt... 0: [2022-11-26 03:28:03,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_16-model_00-model_states.pt. 0: [2022-11-26 03:28:03,155] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_17-model_00-model_states.pt... 32: [2022-11-26 03:28:03,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_35-model_00-model_states.pt. 32: [2022-11-26 03:28:03,202] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_36-model_00-model_states.pt... 0: [2022-11-26 03:28:03,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_17-model_00-model_states.pt. 0: [2022-11-26 03:28:03,372] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_18-model_00-model_states.pt... 32: [2022-11-26 03:28:03,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_36-model_00-model_states.pt. 32: [2022-11-26 03:28:03,427] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_37-model_00-model_states.pt... 0: [2022-11-26 03:28:03,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_18-model_00-model_states.pt. 0: [2022-11-26 03:28:03,589] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_19-model_00-model_states.pt... 32: [2022-11-26 03:28:03,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_37-model_00-model_states.pt. 32: [2022-11-26 03:28:03,658] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_38-model_00-model_states.pt... 0: [2022-11-26 03:28:03,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_19-model_00-model_states.pt. 0: [2022-11-26 03:28:03,806] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_20-model_00-model_states.pt... 32: [2022-11-26 03:28:03,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_38-model_00-model_states.pt. 32: [2022-11-26 03:28:03,883] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/layer_40-model_00-model_states.pt... 32: [2022-11-26 03:28:03,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_40-model_00-model_states.pt. 32: [2022-11-26 03:28:03,891] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/mp_rank_01_model_states.pt... 32: [2022-11-26 03:28:03,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/mp_rank_01_model_states.pt. 0: [2022-11-26 03:28:04,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/layer_20-model_00-model_states.pt. 0: [2022-11-26 03:28:04,025] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step15000/mp_rank_00_model_states.pt 0: [2022-11-26 03:28:04,025] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/mp_rank_00_model_states.pt... 0: [2022-11-26 03:28:04,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/mp_rank_00_model_states.pt. 32: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 32: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 36: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 36: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 48: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 50: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 50: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 50: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 38: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 38: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 51: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 55: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 55: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 55: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 53: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 53: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 53: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 57: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 57: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 57: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 61: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 61: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 59: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 59: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 63: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 63: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 63: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 63: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 52: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 52: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 52: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 52: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 54: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 54: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 56: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 56: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 56: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 58: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 58: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 62: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 62: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 60: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 60: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 60: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 33: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 33: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 35: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 35: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 34: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 34: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 40: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 40: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 40: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 40: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 44: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 44: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 32: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 32: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 42: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 42: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 46: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 46: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 46: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 46: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 36: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 36: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 48: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 48: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 48: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 41: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 37: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 37: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 37: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 47: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 47: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 47: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 49: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 43: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 43: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 45: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 45: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 45: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 45: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 39: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 39: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 39: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 50: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 38: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 38: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 51: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 55: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 53: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 57: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 61: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 59: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 59: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 63: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 19: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 19: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 21: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 21: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 21: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 21: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 21: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 15: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 15: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 15: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 15: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 15: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 15: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 52: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 54: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 56: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 58: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 58: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 62: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 62: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 62: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 16: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 16: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 16: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 60: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 14: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 14: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 14: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 14: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 18: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 18: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 18: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 10: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 10: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 8: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 8: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 8: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 0: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 26: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 26: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 11: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 11: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 17: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 17: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 17: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 17: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 9: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 13: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 25: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 27: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 27: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 27: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 27: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 27: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 29: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 29: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 29: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 31: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 33: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 33: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 1: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 1: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 7: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 3: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 35: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 22: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 12: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 30: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 30: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 30: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 24: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 24: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 28: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 28: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 28: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 28: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 20: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 20: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 20: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 34: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 40: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 44: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 44: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 44: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 6: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 32: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 42: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 42: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 42: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 46: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 46: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 48: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 41: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 41: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 37: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 37: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 37: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 47: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 49: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 43: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 43: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 45: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 45: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 39: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 55: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 53: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 61: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 19: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 21: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 21: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 54: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 56: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 16: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 16: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 16: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 2: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 2: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 2: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 2: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 2: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 4: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 4: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 4: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 4: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 14: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 18: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 10: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 10: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 10: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 8: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 8: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 8: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 0: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 0: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 26: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 26: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 11: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 11: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 11: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 17: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 9: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 9: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 13: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 13: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 23: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 23: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 23: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 25: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 27: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 29: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 31: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 5: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 33: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 33: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 1: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 7: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 7: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 7: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 3: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 3: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 35: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 22: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 22: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 22: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 12: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 12: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 12: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 30: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 30: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 30: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 24: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 24: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 24: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 24: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 28: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 28: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 28: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 20: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 20: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 20: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 40: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 6: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 32: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 32: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 32: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 47: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 49: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 39: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 19: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 21: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 15: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 16: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 2: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 4: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 4: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 4: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 14: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 14: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 18: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 18: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 10: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 10: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 11: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 17: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 9: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 9: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 13: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 13: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 23: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 23: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 23: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 25: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 27: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 29: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 29: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 31: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 5: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 5: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 1: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 7: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 7: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 7: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 3: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 22: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 12: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 12: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 30: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 24: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 28: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 20: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 6: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 19: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 16: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 2: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 2: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 4: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 14: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 10: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 8: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 0: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 26: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 11: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 17: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 9: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 9: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 13: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 13: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 23: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 25: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 27: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 29: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 31: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 31: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 5: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 1: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 3: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 22: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 12: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 30: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 24: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 20: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 6: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 19: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 18: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 0: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 26: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 11: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 9: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 13: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 23: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 25: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 29: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 31: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 5: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 5: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 1: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 3: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 22: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 12: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 6: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 6: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 19: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 0: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 26: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 25: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 31: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 1: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 1: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 3: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 22: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 6: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 19: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 31: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 3: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 6: [2022-11-26 03:28:04,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step15000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 14: [2022-11-26 03:28:04,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-26 03:28:04,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-26 03:28:04,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 23: [2022-11-26 03:28:04,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-26 03:28:04,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-26 03:28:04,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 4: [2022-11-26 03:28:04,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-26 03:28:04,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-26 03:28:04,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 21: [2022-11-26 03:28:04,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 21: [2022-11-26 03:28:04,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-26 03:28:04,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 29: [2022-11-26 03:28:04,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-26 03:28:04,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-26 03:28:04,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 58: [2022-11-26 03:28:04,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-26 03:28:04,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-26 03:28:04,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 61: [2022-11-26 03:28:04,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 61: [2022-11-26 03:28:04,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 61: [2022-11-26 03:28:04,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 30: [2022-11-26 03:28:04,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-26 03:28:04,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-26 03:28:04,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 40: [2022-11-26 03:28:04,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-26 03:28:04,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-26 03:28:04,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 41: [2022-11-26 03:28:04,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 38: [2022-11-26 03:28:04,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 41: [2022-11-26 03:28:04,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 38: [2022-11-26 03:28:04,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 41: [2022-11-26 03:28:04,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 38: [2022-11-26 03:28:04,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 13: [2022-11-26 03:28:04,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 13: [2022-11-26 03:28:04,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-26 03:28:04,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 2: [2022-11-26 03:28:04,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-26 03:28:04,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-26 03:28:04,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 33: [2022-11-26 03:28:04,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 31: [2022-11-26 03:28:04,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 33: [2022-11-26 03:28:04,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 31: [2022-11-26 03:28:04,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 33: [2022-11-26 03:28:04,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 31: [2022-11-26 03:28:04,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 24: [2022-11-26 03:28:04,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-26 03:28:04,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-26 03:28:04,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 19: [2022-11-26 03:28:04,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-26 03:28:04,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-26 03:28:04,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 34: [2022-11-26 03:28:04,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 34: [2022-11-26 03:28:04,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-26 03:28:04,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 37: [2022-11-26 03:28:04,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 37: [2022-11-26 03:28:04,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-26 03:28:04,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 55: [2022-11-26 03:28:04,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 10: [2022-11-26 03:28:04,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 25: [2022-11-26 03:28:04,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 10: [2022-11-26 03:28:04,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 55: [2022-11-26 03:28:04,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 10: [2022-11-26 03:28:04,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 25: [2022-11-26 03:28:04,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 55: [2022-11-26 03:28:04,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 25: [2022-11-26 03:28:04,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 22: [2022-11-26 03:28:04,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-26 03:28:04,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-26 03:28:04,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 3: [2022-11-26 03:28:04,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 12: [2022-11-26 03:28:04,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 3: [2022-11-26 03:28:04,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 12: [2022-11-26 03:28:04,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 3: [2022-11-26 03:28:04,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 12: [2022-11-26 03:28:04,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 0: [2022-11-26 03:28:04,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 44: [2022-11-26 03:28:04,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-26 03:28:04,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 0: [2022-11-26 03:28:04,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 44: [2022-11-26 03:28:04,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 0: [2022-11-26 03:28:04,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 20: [2022-11-26 03:28:04,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 0: [2022-11-26 03:28:04,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 20: [2022-11-26 03:28:04,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 0: [2022-11-26 03:28:04,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 20: [2022-11-26 03:28:04,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 0: [2022-11-26 03:28:04,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 48: [2022-11-26 03:28:04,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-26 03:28:04,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-26 03:28:04,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 42: [2022-11-26 03:28:04,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 42: [2022-11-26 03:28:04,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 14: [2022-11-26 03:28:04,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 13: [2022-11-26 03:28:04,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 42: [2022-11-26 03:28:04,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 14: [2022-11-26 03:28:04,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 13: [2022-11-26 03:28:04,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 14: [2022-11-26 03:28:04,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 13: [2022-11-26 03:28:04,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 29: [2022-11-26 03:28:04,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-26 03:28:04,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-26 03:28:04,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 7: [2022-11-26 03:28:04,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-26 03:28:04,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 12: [2022-11-26 03:28:04,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 7: [2022-11-26 03:28:04,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 12: [2022-11-26 03:28:04,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-26 03:28:04,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 2: [2022-11-26 03:28:04,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-26 03:28:04,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-26 03:28:04,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 63: [2022-11-26 03:28:04,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-26 03:28:04,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-26 03:28:04,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 18: [2022-11-26 03:28:04,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 26: [2022-11-26 03:28:04,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 18: [2022-11-26 03:28:04,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 26: [2022-11-26 03:28:04,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 18: [2022-11-26 03:28:04,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 26: [2022-11-26 03:28:04,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 32: [2022-11-26 03:28:04,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-26 03:28:04,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-26 03:28:04,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 62: [2022-11-26 03:28:04,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-26 03:28:04,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-26 03:28:04,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 9: [2022-11-26 03:28:04,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 54: [2022-11-26 03:28:04,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 15: [2022-11-26 03:28:04,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 31: [2022-11-26 03:28:04,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 15: [2022-11-26 03:28:04,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 31: [2022-11-26 03:28:04,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 15: [2022-11-26 03:28:04,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 31: [2022-11-26 03:28:04,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 61: [2022-11-26 03:28:04,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-26 03:28:04,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 59: [2022-11-26 03:28:04,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 61: [2022-11-26 03:28:04,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 59: [2022-11-26 03:28:04,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-26 03:28:04,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 8: [2022-11-26 03:28:04,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-26 03:28:04,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-26 03:28:04,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 21: [2022-11-26 03:28:04,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-26 03:28:04,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 47: [2022-11-26 03:28:04,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 21: [2022-11-26 03:28:04,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 47: [2022-11-26 03:28:04,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-26 03:28:04,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 27: [2022-11-26 03:28:04,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 33: [2022-11-26 03:28:04,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 27: [2022-11-26 03:28:04,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-26 03:28:04,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 33: [2022-11-26 03:28:04,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 27: [2022-11-26 03:28:04,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 33: [2022-11-26 03:28:04,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 27: [2022-11-26 03:28:04,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 27: [2022-11-26 03:28:04,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 23: [2022-11-26 03:28:04,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-26 03:28:04,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 40: [2022-11-26 03:28:04,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 23: [2022-11-26 03:28:04,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 40: [2022-11-26 03:28:04,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-26 03:28:04,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 1: [2022-11-26 03:28:04,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-26 03:28:04,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-26 03:28:04,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 53: [2022-11-26 03:28:04,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-26 03:28:04,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-26 03:28:04,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 19: [2022-11-26 03:28:04,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 54: [2022-11-26 03:28:04,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-26 03:28:04,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 19: [2022-11-26 03:28:04,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-26 03:28:04,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 35: [2022-11-26 03:28:04,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-26 03:28:04,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-26 03:28:04,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 58: [2022-11-26 03:28:04,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-26 03:28:04,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-26 03:28:04,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 11: [2022-11-26 03:28:04,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 44: [2022-11-26 03:28:04,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 56: [2022-11-26 03:28:04,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-26 03:28:04,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 44: [2022-11-26 03:28:04,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 44: [2022-11-26 03:28:04,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 55: [2022-11-26 03:28:04,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 56: [2022-11-26 03:28:04,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 51: [2022-11-26 03:28:04,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 55: [2022-11-26 03:28:04,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 51: [2022-11-26 03:28:04,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 55: [2022-11-26 03:28:04,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 56: [2022-11-26 03:28:04,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-26 03:28:04,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 51: [2022-11-26 03:28:04,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 56: [2022-11-26 03:28:04,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 30: [2022-11-26 03:28:04,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 49: [2022-11-26 03:28:04,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-26 03:28:04,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 30: [2022-11-26 03:28:04,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 49: [2022-11-26 03:28:04,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 30: [2022-11-26 03:28:04,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 13: [2022-11-26 03:28:04,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-26 03:28:04,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-26 03:28:04,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 28: [2022-11-26 03:28:04,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-26 03:28:04,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-26 03:28:04,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 11: [2022-11-26 03:28:04,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-26 03:28:04,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-26 03:28:04,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 11: [2022-11-26 03:28:04,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-26 03:28:04,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 8: [2022-11-26 03:28:04,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 0: [2022-11-26 03:28:04,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-26 03:28:04,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-26 03:28:04,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 24: [2022-11-26 03:28:04,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 24: [2022-11-26 03:28:04,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 22: [2022-11-26 03:28:04,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 24: [2022-11-26 03:28:04,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 22: [2022-11-26 03:28:04,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 41: [2022-11-26 03:28:04,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 22: [2022-11-26 03:28:04,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 41: [2022-11-26 03:28:04,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-26 03:28:04,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 62: [2022-11-26 03:28:04,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-26 03:28:04,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-26 03:28:04,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 18: [2022-11-26 03:28:04,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-26 03:28:04,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-26 03:28:04,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 37: [2022-11-26 03:28:04,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-26 03:28:04,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 1: [2022-11-26 03:28:04,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 37: [2022-11-26 03:28:04,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 49: [2022-11-26 03:28:04,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-26 03:28:04,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-26 03:28:04,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 8: [2022-11-26 03:28:04,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-26 03:28:04,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 51: [2022-11-26 03:28:04,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 1: [2022-11-26 03:28:04,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-26 03:28:04,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 51: [2022-11-26 03:28:04,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 3: [2022-11-26 03:28:04,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 51: [2022-11-26 03:28:04,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 3: [2022-11-26 03:28:04,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 3: [2022-11-26 03:28:04,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 53: [2022-11-26 03:28:04,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-26 03:28:04,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 25: [2022-11-26 03:28:04,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 53: [2022-11-26 03:28:04,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 25: [2022-11-26 03:28:04,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-26 03:28:04,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 10: [2022-11-26 03:28:04,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-26 03:28:04,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-26 03:28:04,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 20: [2022-11-26 03:28:04,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 42: [2022-11-26 03:28:04,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 20: [2022-11-26 03:28:04,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-26 03:28:04,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 42: [2022-11-26 03:28:04,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 15: [2022-11-26 03:28:04,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 15: [2022-11-26 03:28:04,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 42: [2022-11-26 03:28:04,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 15: [2022-11-26 03:28:04,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 40: [2022-11-26 03:28:04,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-26 03:28:04,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-26 03:28:04,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 35: [2022-11-26 03:28:04,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-26 03:28:04,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-26 03:28:04,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 7: [2022-11-26 03:28:04,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 7: [2022-11-26 03:28:04,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-26 03:28:04,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 54: [2022-11-26 03:28:04,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 54: [2022-11-26 03:28:04,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 33: [2022-11-26 03:28:04,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 54: [2022-11-26 03:28:04,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-26 03:28:04,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 33: [2022-11-26 03:28:04,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 54: [2022-11-26 03:28:04,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 33: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 54: [2022-11-26 03:28:04,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 56: [2022-11-26 03:28:04,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-26 03:28:04,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 4: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 4: [2022-11-26 03:28:04,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 39: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 61: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 37: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 39: [2022-11-26 03:28:04,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 61: [2022-11-26 03:28:04,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 37: [2022-11-26 03:28:04,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 39: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 61: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 37: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 32: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-26 03:28:04,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 34: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 34: [2022-11-26 03:28:04,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 11: [2022-11-26 03:28:04,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 50: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 50: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 11: [2022-11-26 03:28:04,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 34: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 50: [2022-11-26 03:28:04,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-26 03:28:04,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 50: [2022-11-26 03:28:04,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 11: [2022-11-26 03:28:04,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 28: [2022-11-26 03:28:04,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 28: [2022-11-26 03:28:04,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-26 03:28:04,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 39: [2022-11-26 03:28:04,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-26 03:28:04,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-26 03:28:04,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 4: [2022-11-26 03:28:04,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 4: [2022-11-26 03:28:04,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 27: [2022-11-26 03:28:04,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 4: [2022-11-26 03:28:04,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 27: [2022-11-26 03:28:04,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-26 03:28:04,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 0: [2022-11-26 03:28:04,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-26 03:28:04,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 0: [2022-11-26 03:28:04,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 9: [2022-11-26 03:28:04,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-26 03:28:04,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 9: [2022-11-26 03:28:04,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-26 03:28:04,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-26 03:28:04,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 53: [2022-11-26 03:28:04,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-26 03:28:04,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-26 03:28:04,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 59: [2022-11-26 03:28:04,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-26 03:28:04,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-26 03:28:04,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 23: [2022-11-26 03:28:04,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 46: [2022-11-26 03:28:04,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-26 03:28:04,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 63: [2022-11-26 03:28:04,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 23: [2022-11-26 03:28:04,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 46: [2022-11-26 03:28:04,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-26 03:28:04,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 63: [2022-11-26 03:28:04,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 23: [2022-11-26 03:28:04,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 63: [2022-11-26 03:28:04,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 46: [2022-11-26 03:28:04,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 46: [2022-11-26 03:28:04,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 32: [2022-11-26 03:28:04,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-26 03:28:04,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 26: [2022-11-26 03:28:04,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 32: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 26: [2022-11-26 03:28:04,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 52: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-26 03:28:04,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 30: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 3: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 30: [2022-11-26 03:28:04,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 3: [2022-11-26 03:28:04,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 30: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 5: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 5: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 5: [2022-11-26 03:28:04,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-26 03:28:04,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 3: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 5: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 5: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 36: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-26 03:28:04,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-26 03:28:04,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 36: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 45: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 45: [2022-11-26 03:28:04,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-26 03:28:04,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 57: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 45: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 24: [2022-11-26 03:28:04,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 57: [2022-11-26 03:28:04,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 24: [2022-11-26 03:28:04,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 57: [2022-11-26 03:28:04,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 24: [2022-11-26 03:28:04,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 44: [2022-11-26 03:28:04,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-26 03:28:04,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-26 03:28:04,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 58: [2022-11-26 03:28:04,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-26 03:28:04,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-26 03:28:04,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 43: [2022-11-26 03:28:04,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-26 03:28:04,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 43: [2022-11-26 03:28:04,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-26 03:28:04,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 43: [2022-11-26 03:28:04,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 43: [2022-11-26 03:28:04,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 47: [2022-11-26 03:28:04,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-26 03:28:04,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-26 03:28:04,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 38: [2022-11-26 03:28:04,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-26 03:28:04,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 55: [2022-11-26 03:28:04,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 38: [2022-11-26 03:28:04,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 55: [2022-11-26 03:28:04,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 17: [2022-11-26 03:28:04,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-26 03:28:04,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 55: [2022-11-26 03:28:04,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 17: [2022-11-26 03:28:04,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 17: [2022-11-26 03:28:04,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-26 03:28:04,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-26 03:28:04,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 17: [2022-11-26 03:28:04,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-26 03:28:04,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 17: [2022-11-26 03:28:04,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 9: [2022-11-26 03:28:04,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 9: [2022-11-26 03:28:04,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-26 03:28:04,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 8: [2022-11-26 03:28:04,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-26 03:28:04,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-26 03:28:04,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 35: [2022-11-26 03:28:04,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 42: [2022-11-26 03:28:04,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 35: [2022-11-26 03:28:04,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-26 03:28:04,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 22: [2022-11-26 03:28:04,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 42: [2022-11-26 03:28:04,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 22: [2022-11-26 03:28:04,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 42: [2022-11-26 03:28:04,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 22: [2022-11-26 03:28:04,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 16: [2022-11-26 03:28:04,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 18: [2022-11-26 03:28:04,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 16: [2022-11-26 03:28:04,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-26 03:28:04,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 18: [2022-11-26 03:28:04,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 16: [2022-11-26 03:28:04,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 16: [2022-11-26 03:28:04,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 18: [2022-11-26 03:28:04,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 16: [2022-11-26 03:28:04,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 1: [2022-11-26 03:28:04,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 25: [2022-11-26 03:28:04,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 20: [2022-11-26 03:28:04,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 25: [2022-11-26 03:28:04,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 20: [2022-11-26 03:28:04,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 25: [2022-11-26 03:28:04,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 20: [2022-11-26 03:28:04,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 51: [2022-11-26 03:28:04,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-26 03:28:04,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-26 03:28:04,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 10: [2022-11-26 03:28:04,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-26 03:28:04,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 1: [2022-11-26 03:28:04,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-26 03:28:04,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 10: [2022-11-26 03:28:04,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 14: [2022-11-26 03:28:04,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-26 03:28:04,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-26 03:28:04,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 15: [2022-11-26 03:28:04,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-26 03:28:04,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-26 03:28:04,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 19: [2022-11-26 03:28:04,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-26 03:28:04,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-26 03:28:04,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 45: [2022-11-26 03:28:04,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 7: [2022-11-26 03:28:04,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 45: [2022-11-26 03:28:04,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 7: [2022-11-26 03:28:04,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 6: [2022-11-26 03:28:04,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-26 03:28:04,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 43: [2022-11-26 03:28:04,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 45: [2022-11-26 03:28:04,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 7: [2022-11-26 03:28:04,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 6: [2022-11-26 03:28:04,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-26 03:28:04,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-26 03:28:04,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 6: [2022-11-26 03:28:04,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 43: [2022-11-26 03:28:04,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 6: [2022-11-26 03:28:04,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 43: [2022-11-26 03:28:04,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 6: [2022-11-26 03:28:04,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-26 03:28:04,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 16: [2022-11-26 03:28:04,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 16: [2022-11-26 03:28:04,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-26 03:28:04,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 12: [2022-11-26 03:28:04,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-26 03:28:04,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-26 03:28:04,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 41: [2022-11-26 03:28:04,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 36: [2022-11-26 03:28:04,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 41: [2022-11-26 03:28:04,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 36: [2022-11-26 03:28:04,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 41: [2022-11-26 03:28:04,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 36: [2022-11-26 03:28:04,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 31: [2022-11-26 03:28:04,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-26 03:28:04,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-26 03:28:04,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 46: [2022-11-26 03:28:04,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-26 03:28:04,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-26 03:28:04,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 49: [2022-11-26 03:28:04,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-26 03:28:04,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-26 03:28:04,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 34: [2022-11-26 03:28:04,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-26 03:28:04,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 62: [2022-11-26 03:28:04,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-26 03:28:04,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 34: [2022-11-26 03:28:04,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 62: [2022-11-26 03:28:04,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 5: [2022-11-26 03:28:04,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-26 03:28:04,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-26 03:28:04,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 13: [2022-11-26 03:28:04,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-26 03:28:04,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-26 03:28:04,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 2: [2022-11-26 03:28:04,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 2: [2022-11-26 03:28:04,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-26 03:28:04,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 26: [2022-11-26 03:28:04,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-26 03:28:04,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-26 03:28:04,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 57: [2022-11-26 03:28:04,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 57: [2022-11-26 03:28:04,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 57: [2022-11-26 03:28:04,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 63: [2022-11-26 03:28:04,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 50: [2022-11-26 03:28:04,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 63: [2022-11-26 03:28:04,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 50: [2022-11-26 03:28:04,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 63: [2022-11-26 03:28:04,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 50: [2022-11-26 03:28:04,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 14: [2022-11-26 03:28:04,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-26 03:28:04,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-26 03:28:04,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 17: [2022-11-26 03:28:04,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 17: [2022-11-26 03:28:04,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 17: [2022-11-26 03:28:04,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 28: [2022-11-26 03:28:04,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-26 03:28:04,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-26 03:28:04,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 16: [2022-11-26 03:28:04,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-26 03:28:04,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-26 03:28:04,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 2: [2022-11-26 03:28:04,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 2: [2022-11-26 03:28:04,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-26 03:28:04,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 49: [2022-11-26 03:28:04,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-26 03:28:04,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-26 03:28:04,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 38: [2022-11-26 03:28:04,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 52: [2022-11-26 03:28:04,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 38: [2022-11-26 03:28:04,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 52: [2022-11-26 03:28:04,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 38: [2022-11-26 03:28:04,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 52: [2022-11-26 03:28:04,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 54: [2022-11-26 03:28:04,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 54: [2022-11-26 03:28:04,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-26 03:28:04,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 62: [2022-11-26 03:28:04,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-26 03:28:04,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-26 03:28:04,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 15: [2022-11-26 03:28:04,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 28: [2022-11-26 03:28:04,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 15: [2022-11-26 03:28:04,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-26 03:28:04,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 28: [2022-11-26 03:28:04,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-26 03:28:04,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 31: [2022-11-26 03:28:04,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-26 03:28:04,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-26 03:28:04,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 39: [2022-11-26 03:28:04,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 39: [2022-11-26 03:28:04,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-26 03:28:04,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 3: [2022-11-26 03:28:04,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-26 03:28:04,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-26 03:28:04,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 60: [2022-11-26 03:28:04,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-26 03:28:04,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 60: [2022-11-26 03:28:04,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-26 03:28:04,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-26 03:28:04,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-26 03:28:04,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-26 03:28:04,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-26 03:28:04,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-26 03:28:04,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 60: [2022-11-26 03:28:04,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 60: [2022-11-26 03:28:04,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 60: [2022-11-26 03:28:04,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 39: [2022-11-26 03:28:04,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-26 03:28:04,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-26 03:28:04,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 23: [2022-11-26 03:28:04,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-26 03:28:04,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-26 03:28:04,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 24: [2022-11-26 03:28:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-26 03:28:04,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-26 03:28:04,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 43: [2022-11-26 03:28:04,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-26 03:28:04,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-26 03:28:04,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 59: [2022-11-26 03:28:04,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-26 03:28:04,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-26 03:28:04,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 4: [2022-11-26 03:28:04,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 4: [2022-11-26 03:28:04,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-26 03:28:04,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 48: [2022-11-26 03:28:04,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-26 03:28:04,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-26 03:28:04,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 40: [2022-11-26 03:28:04,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-26 03:28:04,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-26 03:28:04,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 34: [2022-11-26 03:28:04,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 34: [2022-11-26 03:28:04,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-26 03:28:04,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 9: [2022-11-26 03:28:04,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 9: [2022-11-26 03:28:04,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-26 03:28:04,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 12: [2022-11-26 03:28:04,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-26 03:28:04,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-26 03:28:04,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 58: [2022-11-26 03:28:04,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-26 03:28:04,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-26 03:28:04,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 21: [2022-11-26 03:28:04,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-26 03:28:04,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 21: [2022-11-26 03:28:04,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 53: [2022-11-26 03:28:04,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-26 03:28:04,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-26 03:28:04,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 47: [2022-11-26 03:28:04,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-26 03:28:04,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-26 03:28:04,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 41: [2022-11-26 03:28:04,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-26 03:28:04,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 41: [2022-11-26 03:28:04,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 50: [2022-11-26 03:28:04,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-26 03:28:04,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-26 03:28:04,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 27: [2022-11-26 03:28:04,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 27: [2022-11-26 03:28:04,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-26 03:28:04,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 19: [2022-11-26 03:28:04,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 19: [2022-11-26 03:28:04,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-26 03:28:04,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 61: [2022-11-26 03:28:04,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-26 03:28:04,390] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-26 03:28:04,390] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 56: [2022-11-26 03:28:04,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 56: [2022-11-26 03:28:04,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 56: [2022-11-26 03:28:04,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 30: [2022-11-26 03:28:04,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-26 03:28:04,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-26 03:28:04,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 29: [2022-11-26 03:28:04,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-26 03:28:04,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-26 03:28:04,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 52: [2022-11-26 03:28:04,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-26 03:28:04,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-26 03:28:04,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 7: [2022-11-26 03:28:04,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-26 03:28:04,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-26 03:28:04,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 37: [2022-11-26 03:28:04,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 37: [2022-11-26 03:28:04,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-26 03:28:04,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 33: [2022-11-26 03:28:04,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-26 03:28:04,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-26 03:28:04,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 57: [2022-11-26 03:28:04,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 57: [2022-11-26 03:28:04,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 57: [2022-11-26 03:28:04,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 26: [2022-11-26 03:28:04,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 26: [2022-11-26 03:28:04,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-26 03:28:04,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 11: [2022-11-26 03:28:04,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 25: [2022-11-26 03:28:04,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 11: [2022-11-26 03:28:04,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 25: [2022-11-26 03:28:04,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 11: [2022-11-26 03:28:04,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 25: [2022-11-26 03:28:04,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 5: [2022-11-26 03:28:04,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-26 03:28:04,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-26 03:28:04,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 44: [2022-11-26 03:28:04,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-26 03:28:04,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-26 03:28:04,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 51: [2022-11-26 03:28:04,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-26 03:28:04,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-26 03:28:04,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 20: [2022-11-26 03:28:04,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-26 03:28:04,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-26 03:28:04,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 6: [2022-11-26 03:28:04,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 6: [2022-11-26 03:28:04,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-26 03:28:04,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 46: [2022-11-26 03:28:04,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-26 03:28:04,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-26 03:28:04,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 63: [2022-11-26 03:28:04,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-26 03:28:04,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-26 03:28:04,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 0: [2022-11-26 03:28:04,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 14: [2022-11-26 03:28:04,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 14: [2022-11-26 03:28:04,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-26 03:28:04,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 8: [2022-11-26 03:28:04,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-26 03:28:04,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 8: [2022-11-26 03:28:04,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 36: [2022-11-26 03:28:04,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-26 03:28:04,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 36: [2022-11-26 03:28:04,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 55: [2022-11-26 03:28:04,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 55: [2022-11-26 03:28:04,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-26 03:28:04,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 32: [2022-11-26 03:28:04,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-26 03:28:04,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-26 03:28:04,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 13: [2022-11-26 03:28:04,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 13: [2022-11-26 03:28:04,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-26 03:28:04,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 10: [2022-11-26 03:28:04,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-26 03:28:04,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-26 03:28:04,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 18: [2022-11-26 03:28:04,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-26 03:28:04,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-26 03:28:04,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 16: [2022-11-26 03:28:04,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-26 03:28:04,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 1: [2022-11-26 03:28:04,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 16: [2022-11-26 03:28:04,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 1: [2022-11-26 03:28:04,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-26 03:28:04,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 28: [2022-11-26 03:28:04,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-26 03:28:04,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-26 03:28:04,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 42: [2022-11-26 03:28:04,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 31: [2022-11-26 03:28:04,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 22: [2022-11-26 03:28:04,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 42: [2022-11-26 03:28:04,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 31: [2022-11-26 03:28:04,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 22: [2022-11-26 03:28:04,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-26 03:28:04,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 42: [2022-11-26 03:28:04,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 31: [2022-11-26 03:28:04,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 2: [2022-11-26 03:28:04,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-26 03:28:04,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-26 03:28:04,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 49: [2022-11-26 03:28:04,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-26 03:28:04,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-26 03:28:04,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 54: [2022-11-26 03:28:04,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 54: [2022-11-26 03:28:04,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-26 03:28:04,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 60: [2022-11-26 03:28:04,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-26 03:28:04,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-26 03:28:04,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 35: [2022-11-26 03:28:04,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-26 03:28:04,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-26 03:28:04,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 15: [2022-11-26 03:28:04,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-26 03:28:04,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-26 03:28:04,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 29: [2022-11-26 03:28:04,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-26 03:28:04,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-26 03:28:04,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 0: [2022-11-26 03:28:04,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-26 03:28:04,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 17: [2022-11-26 03:28:04,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-26 03:28:04,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-26 03:28:04,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 9: [2022-11-26 03:28:04,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-26 03:28:04,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-26 03:28:04,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 62: [2022-11-26 03:28:04,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-26 03:28:04,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-26 03:28:04,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 45: [2022-11-26 03:28:04,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-26 03:28:04,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-26 03:28:04,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 38: [2022-11-26 03:28:04,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-26 03:28:04,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-26 03:28:04,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 24: [2022-11-26 03:28:04,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-26 03:28:04,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-26 03:28:04,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 23: [2022-11-26 03:28:04,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 23: [2022-11-26 03:28:04,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 23: [2022-11-26 03:28:04,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 39: [2022-11-26 03:28:04,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-26 03:28:04,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-26 03:28:04,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 59: [2022-11-26 03:28:04,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-26 03:28:04,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-26 03:28:04,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 4: [2022-11-26 03:28:04,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 4: [2022-11-26 03:28:04,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-26 03:28:04,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 3: [2022-11-26 03:28:04,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-26 03:28:04,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-26 03:28:04,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 12: [2022-11-26 03:28:04,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 12: [2022-11-26 03:28:04,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-26 03:28:04,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 43: [2022-11-26 03:28:04,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-26 03:28:04,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-26 03:28:04,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 48: [2022-11-26 03:28:04,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-26 03:28:04,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-26 03:28:04,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 21: [2022-11-26 03:28:04,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-26 03:28:04,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-26 03:28:04,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 50: [2022-11-26 03:28:04,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 50: [2022-11-26 03:28:04,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-26 03:28:04,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 40: [2022-11-26 03:28:04,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-26 03:28:04,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-26 03:28:04,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 58: [2022-11-26 03:28:04,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 58: [2022-11-26 03:28:04,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-26 03:28:04,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 47: [2022-11-26 03:28:04,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-26 03:28:04,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-26 03:28:04,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 19: [2022-11-26 03:28:04,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-26 03:28:04,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-26 03:28:04,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 41: [2022-11-26 03:28:04,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-26 03:28:04,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-26 03:28:04,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 27: [2022-11-26 03:28:04,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-26 03:28:04,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-26 03:28:04,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 29: [2022-11-26 03:28:04,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 29: [2022-11-26 03:28:04,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-26 03:28:04,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 21: [2022-11-26 03:28:04,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-26 03:28:04,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-26 03:28:04,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 61: [2022-11-26 03:28:04,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-26 03:28:04,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-26 03:28:04,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 53: [2022-11-26 03:28:04,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-26 03:28:04,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-26 03:28:04,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 37: [2022-11-26 03:28:04,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-26 03:28:04,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-26 03:28:04,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 7: [2022-11-26 03:28:04,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 56: [2022-11-26 03:28:04,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 7: [2022-11-26 03:28:04,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-26 03:28:04,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 56: [2022-11-26 03:28:04,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-26 03:28:04,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 34: [2022-11-26 03:28:04,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 34: [2022-11-26 03:28:04,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-26 03:28:04,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 11: [2022-11-26 03:28:04,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-26 03:28:04,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-26 03:28:04,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 33: [2022-11-26 03:28:04,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 26: [2022-11-26 03:28:04,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 33: [2022-11-26 03:28:04,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-26 03:28:04,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 26: [2022-11-26 03:28:04,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-26 03:28:04,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 30: [2022-11-26 03:28:04,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 30: [2022-11-26 03:28:04,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 30: [2022-11-26 03:28:04,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 44: [2022-11-26 03:28:04,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-26 03:28:04,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-26 03:28:04,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 46: [2022-11-26 03:28:04,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-26 03:28:04,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-26 03:28:04,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 5: [2022-11-26 03:28:04,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-26 03:28:04,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-26 03:28:04,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 63: [2022-11-26 03:28:04,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-26 03:28:04,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-26 03:28:04,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 52: [2022-11-26 03:28:04,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-26 03:28:04,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-26 03:28:04,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 6: [2022-11-26 03:28:04,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 6: [2022-11-26 03:28:04,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 6: [2022-11-26 03:28:04,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 57: [2022-11-26 03:28:04,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 57: [2022-11-26 03:28:04,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-26 03:28:04,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 8: [2022-11-26 03:28:04,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 8: [2022-11-26 03:28:04,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-26 03:28:04,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 20: [2022-11-26 03:28:04,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-26 03:28:04,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 20: [2022-11-26 03:28:04,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 51: [2022-11-26 03:28:04,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-26 03:28:04,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-26 03:28:04,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 42: [2022-11-26 03:28:04,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 42: [2022-11-26 03:28:04,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-26 03:28:04,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 1: [2022-11-26 03:28:04,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-26 03:28:04,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 1: [2022-11-26 03:28:04,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 25: [2022-11-26 03:28:04,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 25: [2022-11-26 03:28:04,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-26 03:28:04,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 22: [2022-11-26 03:28:04,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-26 03:28:04,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-26 03:28:04,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 60: [2022-11-26 03:28:04,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 10: [2022-11-26 03:28:04,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 60: [2022-11-26 03:28:04,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 38: [2022-11-26 03:28:04,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 60: [2022-11-26 03:28:04,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 10: [2022-11-26 03:28:04,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 38: [2022-11-26 03:28:04,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 10: [2022-11-26 03:28:04,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 38: [2022-11-26 03:28:04,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 48: [2022-11-26 03:28:04,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-26 03:28:04,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-26 03:28:04,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 35: [2022-11-26 03:28:04,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 35: [2022-11-26 03:28:04,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-26 03:28:04,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 55: [2022-11-26 03:28:04,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-26 03:28:04,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-26 03:28:04,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 18: [2022-11-26 03:28:04,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 36: [2022-11-26 03:28:04,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 0: [2022-11-26 03:28:04,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 36: [2022-11-26 03:28:04,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-26 03:28:04,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 0: [2022-11-26 03:28:04,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-26 03:28:04,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 18: [2022-11-26 03:28:04,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-26 03:28:04,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 32: [2022-11-26 03:28:04,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-26 03:28:04,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-26 03:28:04,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 48: [2022-11-26 03:28:04,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 14: [2022-11-26 03:28:04,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 48: [2022-11-26 03:28:04,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 14: [2022-11-26 03:28:04,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 48: [2022-11-26 03:28:04,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 14: [2022-11-26 03:28:04,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 16: [2022-11-26 03:28:04,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-26 03:28:04,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-26 03:28:04,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 49: [2022-11-26 03:28:04,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 17: [2022-11-26 03:28:04,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-26 03:28:04,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 49: [2022-11-26 03:28:04,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 17: [2022-11-26 03:28:04,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 49: [2022-11-26 03:28:04,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 13: [2022-11-26 03:28:04,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-26 03:28:04,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-26 03:28:04,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 15: [2022-11-26 03:28:04,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 15: [2022-11-26 03:28:04,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-26 03:28:04,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 28: [2022-11-26 03:28:04,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-26 03:28:04,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-26 03:28:04,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 2: [2022-11-26 03:28:04,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-26 03:28:04,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-26 03:28:04,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 62: [2022-11-26 03:28:04,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-26 03:28:04,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-26 03:28:04,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 54: [2022-11-26 03:28:04,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 29: [2022-11-26 03:28:04,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-26 03:28:04,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 54: [2022-11-26 03:28:04,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 29: [2022-11-26 03:28:04,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 54: [2022-11-26 03:28:04,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 3: [2022-11-26 03:28:04,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 3: [2022-11-26 03:28:04,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-26 03:28:04,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 52: [2022-11-26 03:28:04,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-26 03:28:04,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-26 03:28:04,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 9: [2022-11-26 03:28:04,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-26 03:28:04,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-26 03:28:04,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 31: [2022-11-26 03:28:04,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-26 03:28:04,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-26 03:28:04,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 12: [2022-11-26 03:28:04,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-26 03:28:04,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-26 03:28:04,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 59: [2022-11-26 03:28:04,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-26 03:28:04,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-26 03:28:04,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 23: [2022-11-26 03:28:04,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-26 03:28:04,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-26 03:28:04,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 4: [2022-11-26 03:28:04,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 4: [2022-11-26 03:28:04,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-26 03:28:04,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 21: [2022-11-26 03:28:04,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-26 03:28:04,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-26 03:28:04,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 24: [2022-11-26 03:28:04,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-26 03:28:04,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-26 03:28:04,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 45: [2022-11-26 03:28:04,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 45: [2022-11-26 03:28:04,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-26 03:28:04,513] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 19: [2022-11-26 03:28:04,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-26 03:28:04,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-26 03:28:04,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 43: [2022-11-26 03:28:04,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-26 03:28:04,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 43: [2022-11-26 03:28:04,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 58: [2022-11-26 03:28:04,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-26 03:28:04,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-26 03:28:04,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 39: [2022-11-26 03:28:04,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-26 03:28:04,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-26 03:28:04,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 53: [2022-11-26 03:28:04,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 27: [2022-11-26 03:28:04,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 53: [2022-11-26 03:28:04,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 27: [2022-11-26 03:28:04,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 53: [2022-11-26 03:28:04,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 27: [2022-11-26 03:28:04,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 47: [2022-11-26 03:28:04,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-26 03:28:04,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-26 03:28:04,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 40: [2022-11-26 03:28:04,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-26 03:28:04,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-26 03:28:04,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 41: [2022-11-26 03:28:04,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-26 03:28:04,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-26 03:28:04,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 34: [2022-11-26 03:28:04,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-26 03:28:04,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-26 03:28:04,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 7: [2022-11-26 03:28:04,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-26 03:28:04,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-26 03:28:04,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 50: [2022-11-26 03:28:04,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-26 03:28:04,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-26 03:28:04,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 56: [2022-11-26 03:28:04,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 56: [2022-11-26 03:28:04,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 56: [2022-11-26 03:28:04,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 57: [2022-11-26 03:28:04,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 57: [2022-11-26 03:28:04,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 57: [2022-11-26 03:28:04,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 30: [2022-11-26 03:28:04,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-26 03:28:04,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-26 03:28:04,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 61: [2022-11-26 03:28:04,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-26 03:28:04,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-26 03:28:04,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 11: [2022-11-26 03:28:04,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 11: [2022-11-26 03:28:04,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 11: [2022-11-26 03:28:04,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 20: [2022-11-26 03:28:04,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-26 03:28:04,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-26 03:28:04,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 6: [2022-11-26 03:28:04,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 6: [2022-11-26 03:28:04,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 6: [2022-11-26 03:28:04,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 37: [2022-11-26 03:28:04,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-26 03:28:04,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-26 03:28:04,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 46: [2022-11-26 03:28:04,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-26 03:28:04,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-26 03:28:04,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 33: [2022-11-26 03:28:04,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-26 03:28:04,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-26 03:28:04,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 63: [2022-11-26 03:28:04,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-26 03:28:04,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-26 03:28:04,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 8: [2022-11-26 03:28:04,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-26 03:28:04,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-26 03:28:04,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 25: [2022-11-26 03:28:04,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 25: [2022-11-26 03:28:04,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-26 03:28:04,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 5: [2022-11-26 03:28:04,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-26 03:28:04,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-26 03:28:04,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 44: [2022-11-26 03:28:04,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-26 03:28:04,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-26 03:28:04,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 48: [2022-11-26 03:28:04,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-26 03:28:04,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-26 03:28:04,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 55: [2022-11-26 03:28:04,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-26 03:28:04,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-26 03:28:04,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 22: [2022-11-26 03:28:04,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-26 03:28:04,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-26 03:28:04,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 54: [2022-11-26 03:28:04,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 28: [2022-11-26 03:28:04,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 54: [2022-11-26 03:28:04,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 28: [2022-11-26 03:28:04,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-26 03:28:04,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 54: [2022-11-26 03:28:04,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 1: [2022-11-26 03:28:04,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 38: [2022-11-26 03:28:04,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 1: [2022-11-26 03:28:04,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 38: [2022-11-26 03:28:04,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 1: [2022-11-26 03:28:04,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 38: [2022-11-26 03:28:04,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 0: [2022-11-26 03:28:04,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 0: [2022-11-26 03:28:04,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 0: [2022-11-26 03:28:04,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 42: [2022-11-26 03:28:04,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 42: [2022-11-26 03:28:04,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 42: [2022-11-26 03:28:04,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 18: [2022-11-26 03:28:04,547] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 18: [2022-11-26 03:28:04,547] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-26 03:28:04,547] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 16: [2022-11-26 03:28:04,547] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-26 03:28:04,547] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-26 03:28:04,547] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 32: [2022-11-26 03:28:04,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-26 03:28:04,548] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-26 03:28:04,548] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 14: [2022-11-26 03:28:04,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 14: [2022-11-26 03:28:04,548] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-26 03:28:04,548] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 10: [2022-11-26 03:28:04,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-26 03:28:04,548] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-26 03:28:04,548] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 2: [2022-11-26 03:28:04,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-26 03:28:04,549] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-26 03:28:04,549] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 13: [2022-11-26 03:28:04,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 29: [2022-11-26 03:28:04,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 13: [2022-11-26 03:28:04,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-26 03:28:04,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 29: [2022-11-26 03:28:04,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 29: [2022-11-26 03:28:04,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 17: [2022-11-26 03:28:04,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-26 03:28:04,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-26 03:28:04,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 31: [2022-11-26 03:28:04,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-26 03:28:04,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-26 03:28:04,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 62: [2022-11-26 03:28:04,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-26 03:28:04,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-26 03:28:04,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 26: [2022-11-26 03:28:04,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 60: [2022-11-26 03:28:04,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 26: [2022-11-26 03:28:04,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-26 03:28:04,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 60: [2022-11-26 03:28:04,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 60: [2022-11-26 03:28:04,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 36: [2022-11-26 03:28:04,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 36: [2022-11-26 03:28:04,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-26 03:28:04,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 15: [2022-11-26 03:28:04,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-26 03:28:04,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 15: [2022-11-26 03:28:04,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 35: [2022-11-26 03:28:04,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-26 03:28:04,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-26 03:28:04,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 45: [2022-11-26 03:28:04,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-26 03:28:04,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-26 03:28:04,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 52: [2022-11-26 03:28:04,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-26 03:28:04,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-26 03:28:04,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 24: [2022-11-26 03:28:04,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-26 03:28:04,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-26 03:28:04,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 23: [2022-11-26 03:28:04,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-26 03:28:04,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 23: [2022-11-26 03:28:04,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 12: [2022-11-26 03:28:04,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-26 03:28:04,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-26 03:28:04,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 4: [2022-11-26 03:28:04,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-26 03:28:04,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-26 03:28:04,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 43: [2022-11-26 03:28:04,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-26 03:28:04,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-26 03:28:04,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 3: [2022-11-26 03:28:04,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 3: [2022-11-26 03:28:04,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-26 03:28:04,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 59: [2022-11-26 03:28:04,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-26 03:28:04,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-26 03:28:04,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 58: [2022-11-26 03:28:04,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-26 03:28:04,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-26 03:28:04,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 49: [2022-11-26 03:28:04,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 21: [2022-11-26 03:28:04,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 49: [2022-11-26 03:28:04,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 21: [2022-11-26 03:28:04,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 49: [2022-11-26 03:28:04,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 21: [2022-11-26 03:28:04,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 39: [2022-11-26 03:28:04,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-26 03:28:04,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-26 03:28:04,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 40: [2022-11-26 03:28:04,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 40: [2022-11-26 03:28:04,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 40: [2022-11-26 03:28:04,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 19: [2022-11-26 03:28:04,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-26 03:28:04,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-26 03:28:04,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 41: [2022-11-26 03:28:04,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 41: [2022-11-26 03:28:04,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 41: [2022-11-26 03:28:04,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 27: [2022-11-26 03:28:04,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-26 03:28:04,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-26 03:28:04,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 9: [2022-11-26 03:28:04,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 9: [2022-11-26 03:28:04,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-26 03:28:04,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 53: [2022-11-26 03:28:04,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-26 03:28:04,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-26 03:28:04,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 56: [2022-11-26 03:28:04,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 56: [2022-11-26 03:28:04,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-26 03:28:04,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 7: [2022-11-26 03:28:04,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-26 03:28:04,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-26 03:28:04,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 50: [2022-11-26 03:28:04,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-26 03:28:04,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-26 03:28:04,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 33: [2022-11-26 03:28:04,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-26 03:28:04,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-26 03:28:04,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 52: [2022-11-26 03:28:04,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-26 03:28:04,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 30: [2022-11-26 03:28:04,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 52: [2022-11-26 03:28:04,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 25: [2022-11-26 03:28:04,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 25: [2022-11-26 03:28:04,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 30: [2022-11-26 03:28:04,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 25: [2022-11-26 03:28:04,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 30: [2022-11-26 03:28:04,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 11: [2022-11-26 03:28:04,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-26 03:28:04,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 11: [2022-11-26 03:28:04,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 61: [2022-11-26 03:28:04,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-26 03:28:04,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-26 03:28:04,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 57: [2022-11-26 03:28:04,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 57: [2022-11-26 03:28:04,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 57: [2022-11-26 03:28:04,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 26: [2022-11-26 03:28:04,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-26 03:28:04,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-26 03:28:04,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 20: [2022-11-26 03:28:04,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-26 03:28:04,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-26 03:28:04,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 6: [2022-11-26 03:28:04,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-26 03:28:04,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-26 03:28:04,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 28: [2022-11-26 03:28:04,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 8: [2022-11-26 03:28:04,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 28: [2022-11-26 03:28:04,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-26 03:28:04,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 8: [2022-11-26 03:28:04,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-26 03:28:04,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 38: [2022-11-26 03:28:04,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 38: [2022-11-26 03:28:04,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-26 03:28:04,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 42: [2022-11-26 03:28:04,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 42: [2022-11-26 03:28:04,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-26 03:28:04,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 47: [2022-11-26 03:28:04,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-26 03:28:04,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 47: [2022-11-26 03:28:04,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 10: [2022-11-26 03:28:04,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-26 03:28:04,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-26 03:28:04,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 34: [2022-11-26 03:28:04,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 48: [2022-11-26 03:28:04,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 48: [2022-11-26 03:28:04,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 34: [2022-11-26 03:28:04,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 48: [2022-11-26 03:28:04,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 34: [2022-11-26 03:28:04,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 63: [2022-11-26 03:28:04,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 63: [2022-11-26 03:28:04,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-26 03:28:04,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 37: [2022-11-26 03:28:04,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 37: [2022-11-26 03:28:04,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-26 03:28:04,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 0: [2022-11-26 03:28:04,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-26 03:28:04,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-26 03:28:04,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 16: [2022-11-26 03:28:04,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-26 03:28:04,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-26 03:28:04,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 24: [2022-11-26 03:28:04,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 46: [2022-11-26 03:28:04,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 24: [2022-11-26 03:28:04,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 46: [2022-11-26 03:28:04,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 2: [2022-11-26 03:28:04,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 24: [2022-11-26 03:28:04,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 44: [2022-11-26 03:28:04,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 46: [2022-11-26 03:28:04,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 44: [2022-11-26 03:28:04,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 2: [2022-11-26 03:28:04,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 44: [2022-11-26 03:28:04,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 2: [2022-11-26 03:28:04,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 29: [2022-11-26 03:28:04,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-26 03:28:04,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 12: [2022-11-26 03:28:04,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 29: [2022-11-26 03:28:04,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 12: [2022-11-26 03:28:04,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-26 03:28:04,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 9: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 9: [2022-11-26 03:28:04,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 5: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-26 03:28:04,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 5: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 8: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 32: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 8: [2022-11-26 03:28:04,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 32: [2022-11-26 03:28:04,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 61: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 32: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 54: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 32: [2022-11-26 03:28:04,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 61: [2022-11-26 03:28:04,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 31: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 32: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 61: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 54: [2022-11-26 03:28:04,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 13: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 15: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 54: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 27: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 31: [2022-11-26 03:28:04,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 1: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 13: [2022-11-26 03:28:04,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 15: [2022-11-26 03:28:04,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 13: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 27: [2022-11-26 03:28:04,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 31: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 1: [2022-11-26 03:28:04,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 3: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 15: [2022-11-26 03:28:04,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 27: [2022-11-26 03:28:04,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 1: [2022-11-26 03:28:04,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 7: [2022-11-26 03:28:04,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 3: [2022-11-26 03:28:04,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-26 03:28:04,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 7: [2022-11-26 03:28:04,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 7: [2022-11-26 03:28:04,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 40: [2022-11-26 03:28:04,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-26 03:28:04,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-26 03:28:04,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 43: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 36: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 36: [2022-11-26 03:28:04,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 43: [2022-11-26 03:28:04,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 36: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 43: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 60: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 38: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 58: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 60: [2022-11-26 03:28:04,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 1: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 38: [2022-11-26 03:28:04,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 58: [2022-11-26 03:28:04,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 60: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 38: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 58: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 1: [2022-11-26 03:28:04,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 21: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 1: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 41: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 21: [2022-11-26 03:28:04,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 51: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 53: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 21: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 51: [2022-11-26 03:28:04,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 41: [2022-11-26 03:28:04,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 51: [2022-11-26 03:28:04,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 41: [2022-11-26 03:28:04,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 53: [2022-11-26 03:28:04,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-26 03:28:04,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 63: [2022-11-26 03:28:04,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 62: [2022-11-26 03:28:04,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 63: [2022-11-26 03:28:04,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 23: [2022-11-26 03:28:04,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 63: [2022-11-26 03:28:04,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 62: [2022-11-26 03:28:04,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 62: [2022-11-26 03:28:04,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 23: [2022-11-26 03:28:04,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-26 03:28:04,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 33: [2022-11-26 03:28:04,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-26 03:28:04,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 22: [2022-11-26 03:28:04,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 33: [2022-11-26 03:28:04,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 22: [2022-11-26 03:28:04,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-26 03:28:04,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 5: [2022-11-26 03:28:04,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 39: [2022-11-26 03:28:04,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 5: [2022-11-26 03:28:04,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-26 03:28:04,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 39: [2022-11-26 03:28:04,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-26 03:28:04,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 11: [2022-11-26 03:28:04,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 11: [2022-11-26 03:28:04,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 11: [2022-11-26 03:28:04,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 59: [2022-11-26 03:28:04,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 59: [2022-11-26 03:28:04,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-26 03:28:04,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 4: [2022-11-26 03:28:04,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-26 03:28:04,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-26 03:28:04,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 17: [2022-11-26 03:28:04,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-26 03:28:04,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-26 03:28:04,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 34: [2022-11-26 03:28:04,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 26: [2022-11-26 03:28:04,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 34: [2022-11-26 03:28:04,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-26 03:28:04,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 26: [2022-11-26 03:28:04,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-26 03:28:04,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 56: [2022-11-26 03:28:04,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 22: [2022-11-26 03:28:04,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 56: [2022-11-26 03:28:04,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-26 03:28:04,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 22: [2022-11-26 03:28:04,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-26 03:28:04,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 6: [2022-11-26 03:28:04,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-26 03:28:04,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-26 03:28:04,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 52: [2022-11-26 03:28:04,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 50: [2022-11-26 03:28:04,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-26 03:28:04,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 57: [2022-11-26 03:28:04,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 52: [2022-11-26 03:28:04,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-26 03:28:04,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 50: [2022-11-26 03:28:04,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 57: [2022-11-26 03:28:04,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-26 03:28:04,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 18: [2022-11-26 03:28:04,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-26 03:28:04,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-26 03:28:04,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-26 03:28:04,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-26 03:28:04,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 18: [2022-11-26 03:28:04,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 42: [2022-11-26 03:28:04,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 42: [2022-11-26 03:28:04,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-26 03:28:04,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 20: [2022-11-26 03:28:04,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-26 03:28:04,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-26 03:28:04,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 25: [2022-11-26 03:28:04,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 25: [2022-11-26 03:28:04,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-26 03:28:04,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 49: [2022-11-26 03:28:04,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-26 03:28:04,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-26 03:28:04,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 35: [2022-11-26 03:28:04,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 35: [2022-11-26 03:28:04,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-26 03:28:04,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 19: [2022-11-26 03:28:04,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 19: [2022-11-26 03:28:04,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-26 03:28:04,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 37: [2022-11-26 03:28:04,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-26 03:28:04,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-26 03:28:04,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 46: [2022-11-26 03:28:04,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 10: [2022-11-26 03:28:04,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 46: [2022-11-26 03:28:04,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 10: [2022-11-26 03:28:04,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 46: [2022-11-26 03:28:04,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 10: [2022-11-26 03:28:04,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 48: [2022-11-26 03:28:04,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-26 03:28:04,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-26 03:28:04,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 45: [2022-11-26 03:28:04,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 30: [2022-11-26 03:28:04,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 45: [2022-11-26 03:28:04,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 30: [2022-11-26 03:28:04,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-26 03:28:04,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 45: [2022-11-26 03:28:04,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 44: [2022-11-26 03:28:04,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-26 03:28:04,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-26 03:28:04,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 35: [2022-11-26 03:28:04,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-26 03:28:04,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-26 03:28:04,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 45: [2022-11-26 03:28:04,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-26 03:28:04,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-26 03:28:04,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 59: [2022-11-26 03:28:04,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-26 03:28:04,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-26 03:28:04,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 57: [2022-11-26 03:28:04,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-26 03:28:04,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-26 03:28:04,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 36: [2022-11-26 03:28:04,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 36: [2022-11-26 03:28:04,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-26 03:28:04,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 55: [2022-11-26 03:28:04,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-26 03:28:04,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-26 03:28:04,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 47: [2022-11-26 03:28:04,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-26 03:28:04,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 47: [2022-11-26 03:28:04,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-26 03:28:04,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 47: [2022-11-26 03:28:04,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 47: [2022-11-26 03:28:04,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 55: [2022-11-26 03:28:04,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-26 03:28:04,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-26 03:28:04,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 51: [2022-11-26 03:28:04,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-26 03:28:04,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-26 03:28:04,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 14: [2022-11-26 03:28:04,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 14: [2022-11-26 03:28:04,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-26 03:28:04,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 51: [2022-11-26 03:28:04,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-26 03:28:04,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step15000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-26 03:28:04,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! 0: successfully saved checkpoint at iteration 15000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5082.37 63: iteration 15010/ 24424 | consumed samples: 7685120 | consumed tokens: 15739125760 | elapsed time per iteration (s): 2.86 | learning rate: 7.934E-05 | global batch size: 512 | lm loss: 2.052665E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.807 | TFLOPs: 18.41 | 63: iteration 15020/ 24424 | consumed samples: 7690240 | consumed tokens: 15749611520 | elapsed time per iteration (s): 2.25 | learning rate: 7.923E-05 | global batch size: 512 | lm loss: 2.051077E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.196 | TFLOPs: 23.39 | 63: iteration 15030/ 24424 | consumed samples: 7695360 | consumed tokens: 15760097280 | elapsed time per iteration (s): 2.27 | learning rate: 7.912E-05 | global batch size: 512 | lm loss: 2.060811E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.305 | TFLOPs: 23.19 | 63: iteration 15040/ 24424 | consumed samples: 7700480 | consumed tokens: 15770583040 | elapsed time per iteration (s): 2.26 | learning rate: 7.901E-05 | global batch size: 512 | lm loss: 2.060214E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.911 | TFLOPs: 23.36 | 63: iteration 15050/ 24424 | consumed samples: 7705600 | consumed tokens: 15781068800 | elapsed time per iteration (s): 2.23 | learning rate: 7.890E-05 | global batch size: 512 | lm loss: 2.037766E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.772 | TFLOPs: 23.65 | 63: iteration 15060/ 24424 | consumed samples: 7710720 | consumed tokens: 15791554560 | elapsed time per iteration (s): 2.26 | learning rate: 7.879E-05 | global batch size: 512 | lm loss: 2.051027E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.423 | TFLOPs: 23.31 | 63: iteration 15070/ 24424 | consumed samples: 7715840 | consumed tokens: 15802040320 | elapsed time per iteration (s): 2.24 | learning rate: 7.868E-05 | global batch size: 512 | lm loss: 2.052453E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.068 | TFLOPs: 23.58 | 63: iteration 15080/ 24424 | consumed samples: 7720960 | consumed tokens: 15812526080 | elapsed time per iteration (s): 2.23 | learning rate: 7.857E-05 | global batch size: 512 | lm loss: 2.056261E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.105 | TFLOPs: 23.59 | 63: iteration 15090/ 24424 | consumed samples: 7726080 | consumed tokens: 15823011840 | elapsed time per iteration (s): 2.23 | learning rate: 7.846E-05 | global batch size: 512 | lm loss: 2.049784E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.602 | TFLOPs: 23.64 | 63: iteration 15100/ 24424 | consumed samples: 7731200 | consumed tokens: 15833497600 | elapsed time per iteration (s): 2.23 | learning rate: 7.835E-05 | global batch size: 512 | lm loss: 2.047641E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.232 | TFLOPs: 23.60 | 63: iteration 15110/ 24424 | consumed samples: 7736320 | consumed tokens: 15843983360 | elapsed time per iteration (s): 2.63 | learning rate: 7.824E-05 | global batch size: 512 | lm loss: 2.039471E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 194.929 | TFLOPs: 20.07 | 63: iteration 15120/ 24424 | consumed samples: 7741440 | consumed tokens: 15854469120 | elapsed time per iteration (s): 2.24 | learning rate: 7.814E-05 | global batch size: 512 | lm loss: 2.034060E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.360 | TFLOPs: 23.51 | 63: iteration 15130/ 24424 | consumed samples: 7746560 | consumed tokens: 15864954880 | elapsed time per iteration (s): 2.24 | learning rate: 7.803E-05 | global batch size: 512 | lm loss: 2.073307E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.369 | TFLOPs: 23.51 | 63: iteration 15140/ 24424 | consumed samples: 7751680 | consumed tokens: 15875440640 | elapsed time per iteration (s): 2.26 | learning rate: 7.792E-05 | global batch size: 512 | lm loss: 2.059005E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.473 | TFLOPs: 23.31 | 63: iteration 15150/ 24424 | consumed samples: 7756800 | consumed tokens: 15885926400 | elapsed time per iteration (s): 2.24 | learning rate: 7.781E-05 | global batch size: 512 | lm loss: 2.049009E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.937 | TFLOPs: 23.57 | 63: iteration 15160/ 24424 | consumed samples: 7761920 | consumed tokens: 15896412160 | elapsed time per iteration (s): 2.24 | learning rate: 7.770E-05 | global batch size: 512 | lm loss: 2.047568E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.272 | TFLOPs: 23.50 | 63: iteration 15170/ 24424 | consumed samples: 7767040 | consumed tokens: 15906897920 | elapsed time per iteration (s): 2.26 | learning rate: 7.759E-05 | global batch size: 512 | lm loss: 2.046681E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.080 | TFLOPs: 23.27 | 63: iteration 15180/ 24424 | consumed samples: 7772160 | consumed tokens: 15917383680 | elapsed time per iteration (s): 2.27 | learning rate: 7.748E-05 | global batch size: 512 | lm loss: 2.046950E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.846 | TFLOPs: 23.25 | 63: iteration 15190/ 24424 | consumed samples: 7777280 | consumed tokens: 15927869440 | elapsed time per iteration (s): 2.25 | learning rate: 7.737E-05 | global batch size: 512 | lm loss: 2.057847E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.990 | TFLOPs: 23.47 | 63: iteration 15200/ 24424 | consumed samples: 7782400 | consumed tokens: 15938355200 | elapsed time per iteration (s): 2.33 | learning rate: 7.726E-05 | global batch size: 512 | lm loss: 2.064738E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 219.658 | TFLOPs: 22.61 | 63: iteration 15210/ 24424 | consumed samples: 7787520 | consumed tokens: 15948840960 | elapsed time per iteration (s): 2.28 | learning rate: 7.715E-05 | global batch size: 512 | lm loss: 2.036128E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.803 | TFLOPs: 23.14 | 63: iteration 15220/ 24424 | consumed samples: 7792640 | consumed tokens: 15959326720 | elapsed time per iteration (s): 2.32 | learning rate: 7.704E-05 | global batch size: 512 | lm loss: 2.049530E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.651 | TFLOPs: 22.71 | 63: iteration 15230/ 24424 | consumed samples: 7797760 | consumed tokens: 15969812480 | elapsed time per iteration (s): 2.23 | learning rate: 7.694E-05 | global batch size: 512 | lm loss: 2.028319E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.917 | TFLOPs: 23.67 | 63: iteration 15240/ 24424 | consumed samples: 7802880 | consumed tokens: 15980298240 | elapsed time per iteration (s): 2.25 | learning rate: 7.683E-05 | global batch size: 512 | lm loss: 2.049186E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.868 | TFLOPs: 23.46 | 63: iteration 15250/ 24424 | consumed samples: 7808000 | consumed tokens: 15990784000 | elapsed time per iteration (s): 2.23 | learning rate: 7.672E-05 | global batch size: 512 | lm loss: 2.044222E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.152 | TFLOPs: 23.59 | 63: iteration 15260/ 24424 | consumed samples: 7813120 | consumed tokens: 16001269760 | elapsed time per iteration (s): 2.24 | learning rate: 7.661E-05 | global batch size: 512 | lm loss: 2.048525E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.307 | TFLOPs: 23.50 | 63: iteration 15270/ 24424 | consumed samples: 7818240 | consumed tokens: 16011755520 | elapsed time per iteration (s): 2.33 | learning rate: 7.650E-05 | global batch size: 512 | lm loss: 2.026237E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 219.846 | TFLOPs: 22.63 | 63: iteration 15280/ 24424 | consumed samples: 7823360 | consumed tokens: 16022241280 | elapsed time per iteration (s): 2.25 | learning rate: 7.639E-05 | global batch size: 512 | lm loss: 2.062573E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.521 | TFLOPs: 23.42 | 63: iteration 15290/ 24424 | consumed samples: 7828480 | consumed tokens: 16032727040 | elapsed time per iteration (s): 2.23 | learning rate: 7.628E-05 | global batch size: 512 | lm loss: 2.044133E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.196 | TFLOPs: 23.59 | 63: iteration 15300/ 24424 | consumed samples: 7833600 | consumed tokens: 16043212800 | elapsed time per iteration (s): 2.29 | learning rate: 7.618E-05 | global batch size: 512 | lm loss: 2.053091E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.972 | TFLOPs: 23.06 | 63: iteration 15310/ 24424 | consumed samples: 7838720 | consumed tokens: 16053698560 | elapsed time per iteration (s): 2.30 | learning rate: 7.607E-05 | global batch size: 512 | lm loss: 2.045592E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.236 | TFLOPs: 22.88 | 63: iteration 15320/ 24424 | consumed samples: 7843840 | consumed tokens: 16064184320 | elapsed time per iteration (s): 2.43 | learning rate: 7.596E-05 | global batch size: 512 | lm loss: 2.025595E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 210.738 | TFLOPs: 21.69 | 63: iteration 15330/ 24424 | consumed samples: 7848960 | consumed tokens: 16074670080 | elapsed time per iteration (s): 2.27 | learning rate: 7.585E-05 | global batch size: 512 | lm loss: 2.048975E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.138 | TFLOPs: 23.18 | 63: iteration 15340/ 24424 | consumed samples: 7854080 | consumed tokens: 16085155840 | elapsed time per iteration (s): 8.63 | learning rate: 7.574E-05 | global batch size: 512 | lm loss: 2.073791E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 59.330 | TFLOPs: 6.11 | 63: iteration 15350/ 24424 | consumed samples: 7859200 | consumed tokens: 16095641600 | elapsed time per iteration (s): 2.24 | learning rate: 7.563E-05 | global batch size: 512 | lm loss: 2.063936E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.994 | TFLOPs: 23.57 | 63: iteration 15360/ 24424 | consumed samples: 7864320 | consumed tokens: 16106127360 | elapsed time per iteration (s): 2.24 | learning rate: 7.553E-05 | global batch size: 512 | lm loss: 2.040764E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.569 | TFLOPs: 23.53 | 63: iteration 15370/ 24424 | consumed samples: 7869440 | consumed tokens: 16116613120 | elapsed time per iteration (s): 2.24 | learning rate: 7.542E-05 | global batch size: 512 | lm loss: 2.037105E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.435 | TFLOPs: 23.52 | 63: iteration 15380/ 24424 | consumed samples: 7874560 | consumed tokens: 16127098880 | elapsed time per iteration (s): 2.23 | learning rate: 7.531E-05 | global batch size: 512 | lm loss: 2.039688E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.259 | TFLOPs: 23.60 | 63: iteration 15390/ 24424 | consumed samples: 7879680 | consumed tokens: 16137584640 | elapsed time per iteration (s): 2.23 | learning rate: 7.520E-05 | global batch size: 512 | lm loss: 2.035775E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.795 | TFLOPs: 23.66 | 63: iteration 15400/ 24424 | consumed samples: 7884800 | consumed tokens: 16148070400 | elapsed time per iteration (s): 2.26 | learning rate: 7.510E-05 | global batch size: 512 | lm loss: 2.026790E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.566 | TFLOPs: 23.32 | 63: iteration 15410/ 24424 | consumed samples: 7889920 | consumed tokens: 16158556160 | elapsed time per iteration (s): 2.26 | learning rate: 7.499E-05 | global batch size: 512 | lm loss: 2.034382E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.913 | TFLOPs: 23.36 | 63: iteration 15420/ 24424 | consumed samples: 7895040 | consumed tokens: 16169041920 | elapsed time per iteration (s): 2.24 | learning rate: 7.488E-05 | global batch size: 512 | lm loss: 2.044248E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.036 | TFLOPs: 23.58 | 63: iteration 15430/ 24424 | consumed samples: 7900160 | consumed tokens: 16179527680 | elapsed time per iteration (s): 2.25 | learning rate: 7.477E-05 | global batch size: 512 | lm loss: 2.043659E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.373 | TFLOPs: 23.41 | 63: iteration 15440/ 24424 | consumed samples: 7905280 | consumed tokens: 16190013440 | elapsed time per iteration (s): 2.24 | learning rate: 7.466E-05 | global batch size: 512 | lm loss: 2.044558E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.043 | TFLOPs: 23.58 | 63: iteration 15450/ 24424 | consumed samples: 7910400 | consumed tokens: 16200499200 | elapsed time per iteration (s): 2.25 | learning rate: 7.456E-05 | global batch size: 512 | lm loss: 2.036744E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.778 | TFLOPs: 23.45 | 63: iteration 15460/ 24424 | consumed samples: 7915520 | consumed tokens: 16210984960 | elapsed time per iteration (s): 2.32 | learning rate: 7.445E-05 | global batch size: 512 | lm loss: 2.040618E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.229 | TFLOPs: 22.67 | 63: iteration 15470/ 24424 | consumed samples: 7920640 | consumed tokens: 16221470720 | elapsed time per iteration (s): 2.24 | learning rate: 7.434E-05 | global batch size: 512 | lm loss: 2.043182E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.659 | TFLOPs: 23.54 | 63: iteration 15480/ 24424 | consumed samples: 7925760 | consumed tokens: 16231956480 | elapsed time per iteration (s): 2.25 | learning rate: 7.424E-05 | global batch size: 512 | lm loss: 2.041613E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.370 | TFLOPs: 23.41 | 63: iteration 15490/ 24424 | consumed samples: 7930880 | consumed tokens: 16242442240 | elapsed time per iteration (s): 2.25 | learning rate: 7.413E-05 | global batch size: 512 | lm loss: 2.030110E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.137 | TFLOPs: 23.38 | 63: iteration 15500/ 24424 | consumed samples: 7936000 | consumed tokens: 16252928000 | elapsed time per iteration (s): 2.23 | learning rate: 7.402E-05 | global batch size: 512 | lm loss: 2.043092E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.955 | TFLOPs: 23.67 | 63: iteration 15510/ 24424 | consumed samples: 7941120 | consumed tokens: 16263413760 | elapsed time per iteration (s): 2.24 | learning rate: 7.391E-05 | global batch size: 512 | lm loss: 2.072861E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.765 | TFLOPs: 23.55 | 63: iteration 15520/ 24424 | consumed samples: 7946240 | consumed tokens: 16273899520 | elapsed time per iteration (s): 2.26 | learning rate: 7.381E-05 | global batch size: 512 | lm loss: 2.070146E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.348 | TFLOPs: 23.30 | 63: iteration 15530/ 24424 | consumed samples: 7951360 | consumed tokens: 16284385280 | elapsed time per iteration (s): 2.26 | learning rate: 7.370E-05 | global batch size: 512 | lm loss: 2.033436E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.941 | TFLOPs: 23.36 | 63: iteration 15540/ 24424 | consumed samples: 7956480 | consumed tokens: 16294871040 | elapsed time per iteration (s): 2.24 | learning rate: 7.359E-05 | global batch size: 512 | lm loss: 2.070252E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.070 | TFLOPs: 23.48 | 63: iteration 15550/ 24424 | consumed samples: 7961600 | consumed tokens: 16305356800 | elapsed time per iteration (s): 2.25 | learning rate: 7.349E-05 | global batch size: 512 | lm loss: 2.048821E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.166 | TFLOPs: 23.39 | 63: iteration 15560/ 24424 | consumed samples: 7966720 | consumed tokens: 16315842560 | elapsed time per iteration (s): 2.63 | learning rate: 7.338E-05 | global batch size: 512 | lm loss: 2.045161E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 194.422 | TFLOPs: 20.01 | 63: iteration 15570/ 24424 | consumed samples: 7971840 | consumed tokens: 16326328320 | elapsed time per iteration (s): 2.25 | learning rate: 7.327E-05 | global batch size: 512 | lm loss: 2.057260E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.850 | TFLOPs: 23.46 | 63: iteration 15580/ 24424 | consumed samples: 7976960 | consumed tokens: 16336814080 | elapsed time per iteration (s): 2.23 | learning rate: 7.317E-05 | global batch size: 512 | lm loss: 2.031142E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.292 | TFLOPs: 23.60 | 63: iteration 15590/ 24424 | consumed samples: 7982080 | consumed tokens: 16347299840 | elapsed time per iteration (s): 2.23 | learning rate: 7.306E-05 | global batch size: 512 | lm loss: 2.050841E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.231 | TFLOPs: 23.60 | 63: iteration 15600/ 24424 | consumed samples: 7987200 | consumed tokens: 16357785600 | elapsed time per iteration (s): 2.24 | learning rate: 7.295E-05 | global batch size: 512 | lm loss: 2.039290E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.323 | TFLOPs: 23.50 | 63: iteration 15610/ 24424 | consumed samples: 7992320 | consumed tokens: 16368271360 | elapsed time per iteration (s): 2.25 | learning rate: 7.285E-05 | global batch size: 512 | lm loss: 2.048066E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.074 | TFLOPs: 23.38 | 63: iteration 15620/ 24424 | consumed samples: 7997440 | consumed tokens: 16378757120 | elapsed time per iteration (s): 2.27 | learning rate: 7.274E-05 | global batch size: 512 | lm loss: 2.047221E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.030 | TFLOPs: 23.27 | 63: iteration 15630/ 24424 | consumed samples: 8002560 | consumed tokens: 16389242880 | elapsed time per iteration (s): 2.25 | learning rate: 7.263E-05 | global batch size: 512 | lm loss: 2.024211E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.805 | TFLOPs: 23.45 | 63: iteration 15640/ 24424 | consumed samples: 8007680 | consumed tokens: 16399728640 | elapsed time per iteration (s): 2.27 | learning rate: 7.253E-05 | global batch size: 512 | lm loss: 2.016598E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.820 | TFLOPs: 23.25 | 63: iteration 15650/ 24424 | consumed samples: 8012800 | consumed tokens: 16410214400 | elapsed time per iteration (s): 2.25 | learning rate: 7.242E-05 | global batch size: 512 | lm loss: 2.030813E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.116 | TFLOPs: 23.38 | 63: iteration 15660/ 24424 | consumed samples: 8017920 | consumed tokens: 16420700160 | elapsed time per iteration (s): 2.26 | learning rate: 7.231E-05 | global batch size: 512 | lm loss: 2.033232E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.469 | TFLOPs: 23.31 | 63: iteration 15670/ 24424 | consumed samples: 8023040 | consumed tokens: 16431185920 | elapsed time per iteration (s): 2.30 | learning rate: 7.221E-05 | global batch size: 512 | lm loss: 2.043675E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.430 | TFLOPs: 22.90 | 63: iteration 15680/ 24424 | consumed samples: 8028160 | consumed tokens: 16441671680 | elapsed time per iteration (s): 2.29 | learning rate: 7.210E-05 | global batch size: 512 | lm loss: 2.044519E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.350 | TFLOPs: 22.99 | 63: iteration 15690/ 24424 | consumed samples: 8033280 | consumed tokens: 16452157440 | elapsed time per iteration (s): 2.25 | learning rate: 7.200E-05 | global batch size: 512 | lm loss: 2.024326E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.996 | TFLOPs: 23.47 | 63: iteration 15700/ 24424 | consumed samples: 8038400 | consumed tokens: 16462643200 | elapsed time per iteration (s): 2.25 | learning rate: 7.189E-05 | global batch size: 512 | lm loss: 2.046521E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.791 | TFLOPs: 23.45 | 63: iteration 15710/ 24424 | consumed samples: 8043520 | consumed tokens: 16473128960 | elapsed time per iteration (s): 2.25 | learning rate: 7.178E-05 | global batch size: 512 | lm loss: 2.031978E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.823 | TFLOPs: 23.45 | 63: iteration 15720/ 24424 | consumed samples: 8048640 | consumed tokens: 16483614720 | elapsed time per iteration (s): 2.23 | learning rate: 7.168E-05 | global batch size: 512 | lm loss: 2.052668E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.279 | TFLOPs: 23.60 | 63: iteration 15730/ 24424 | consumed samples: 8053760 | consumed tokens: 16494100480 | elapsed time per iteration (s): 2.25 | learning rate: 7.157E-05 | global batch size: 512 | lm loss: 2.043652E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.852 | TFLOPs: 23.46 | 63: iteration 15740/ 24424 | consumed samples: 8058880 | consumed tokens: 16504586240 | elapsed time per iteration (s): 2.24 | learning rate: 7.147E-05 | global batch size: 512 | lm loss: 2.059577E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.247 | TFLOPs: 23.50 | 63: iteration 15750/ 24424 | consumed samples: 8064000 | consumed tokens: 16515072000 | elapsed time per iteration (s): 2.58 | learning rate: 7.136E-05 | global batch size: 512 | lm loss: 2.029980E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 198.372 | TFLOPs: 20.42 | 63: iteration 15760/ 24424 | consumed samples: 8069120 | consumed tokens: 16525557760 | elapsed time per iteration (s): 2.25 | learning rate: 7.125E-05 | global batch size: 512 | lm loss: 2.038991E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.241 | TFLOPs: 23.39 | 63: iteration 15770/ 24424 | consumed samples: 8074240 | consumed tokens: 16536043520 | elapsed time per iteration (s): 2.26 | learning rate: 7.115E-05 | global batch size: 512 | lm loss: 2.026206E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.098 | TFLOPs: 23.28 | 63: iteration 15780/ 24424 | consumed samples: 8079360 | consumed tokens: 16546529280 | elapsed time per iteration (s): 2.24 | learning rate: 7.104E-05 | global batch size: 512 | lm loss: 2.042180E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.465 | TFLOPs: 23.52 | 63: iteration 15790/ 24424 | consumed samples: 8084480 | consumed tokens: 16557015040 | elapsed time per iteration (s): 2.25 | learning rate: 7.094E-05 | global batch size: 512 | lm loss: 2.039440E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.280 | TFLOPs: 23.40 | 63: iteration 15800/ 24424 | consumed samples: 8089600 | consumed tokens: 16567500800 | elapsed time per iteration (s): 2.24 | learning rate: 7.083E-05 | global batch size: 512 | lm loss: 2.021875E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.242 | TFLOPs: 23.50 | 63: iteration 15810/ 24424 | consumed samples: 8094720 | consumed tokens: 16577986560 | elapsed time per iteration (s): 2.26 | learning rate: 7.073E-05 | global batch size: 512 | lm loss: 2.034256E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.379 | TFLOPs: 23.30 | 63: iteration 15820/ 24424 | consumed samples: 8099840 | consumed tokens: 16588472320 | elapsed time per iteration (s): 2.23 | learning rate: 7.062E-05 | global batch size: 512 | lm loss: 2.041600E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.617 | TFLOPs: 23.64 | 63: iteration 15830/ 24424 | consumed samples: 8104960 | consumed tokens: 16598958080 | elapsed time per iteration (s): 2.25 | learning rate: 7.052E-05 | global batch size: 512 | lm loss: 2.031949E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.109 | TFLOPs: 23.38 | 63: iteration 15840/ 24424 | consumed samples: 8110080 | consumed tokens: 16609443840 | elapsed time per iteration (s): 2.25 | learning rate: 7.041E-05 | global batch size: 512 | lm loss: 2.038341E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.871 | TFLOPs: 23.46 | 63: iteration 15850/ 24424 | consumed samples: 8115200 | consumed tokens: 16619929600 | elapsed time per iteration (s): 2.23 | learning rate: 7.031E-05 | global batch size: 512 | lm loss: 2.033030E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.032 | TFLOPs: 23.68 | 63: iteration 15860/ 24424 | consumed samples: 8120320 | consumed tokens: 16630415360 | elapsed time per iteration (s): 2.24 | learning rate: 7.020E-05 | global batch size: 512 | lm loss: 2.048897E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.628 | TFLOPs: 23.54 | 63: iteration 15870/ 24424 | consumed samples: 8125440 | consumed tokens: 16640901120 | elapsed time per iteration (s): 2.25 | learning rate: 7.010E-05 | global batch size: 512 | lm loss: 2.055455E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.799 | TFLOPs: 23.45 | 63: iteration 15880/ 24424 | consumed samples: 8130560 | consumed tokens: 16651386880 | elapsed time per iteration (s): 2.35 | learning rate: 6.999E-05 | global batch size: 512 | lm loss: 2.045125E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 217.913 | TFLOPs: 22.43 | 63: iteration 15890/ 24424 | consumed samples: 8135680 | consumed tokens: 16661872640 | elapsed time per iteration (s): 2.23 | learning rate: 6.989E-05 | global batch size: 512 | lm loss: 2.039028E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.639 | TFLOPs: 23.64 | 63: iteration 15900/ 24424 | consumed samples: 8140800 | consumed tokens: 16672358400 | elapsed time per iteration (s): 2.25 | learning rate: 6.978E-05 | global batch size: 512 | lm loss: 2.043979E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.955 | TFLOPs: 23.47 | 63: iteration 15910/ 24424 | consumed samples: 8145920 | consumed tokens: 16682844160 | elapsed time per iteration (s): 2.26 | learning rate: 6.968E-05 | global batch size: 512 | lm loss: 2.035170E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.580 | TFLOPs: 23.33 | 63: iteration 15920/ 24424 | consumed samples: 8151040 | consumed tokens: 16693329920 | elapsed time per iteration (s): 2.26 | learning rate: 6.957E-05 | global batch size: 512 | lm loss: 2.047408E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.273 | TFLOPs: 23.29 | 63: iteration 15930/ 24424 | consumed samples: 8156160 | consumed tokens: 16703815680 | elapsed time per iteration (s): 2.25 | learning rate: 6.947E-05 | global batch size: 512 | lm loss: 2.023063E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.864 | TFLOPs: 23.46 | 63: iteration 15940/ 24424 | consumed samples: 8161280 | consumed tokens: 16714301440 | elapsed time per iteration (s): 2.25 | learning rate: 6.937E-05 | global batch size: 512 | lm loss: 2.032621E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.842 | TFLOPs: 23.46 | 63: iteration 15950/ 24424 | consumed samples: 8166400 | consumed tokens: 16724787200 | elapsed time per iteration (s): 2.23 | learning rate: 6.926E-05 | global batch size: 512 | lm loss: 2.033375E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.107 | TFLOPs: 23.59 | 63: iteration 15960/ 24424 | consumed samples: 8171520 | consumed tokens: 16735272960 | elapsed time per iteration (s): 2.28 | learning rate: 6.916E-05 | global batch size: 512 | lm loss: 2.025964E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.780 | TFLOPs: 23.14 | 63: iteration 15970/ 24424 | consumed samples: 8176640 | consumed tokens: 16745758720 | elapsed time per iteration (s): 2.22 | learning rate: 6.905E-05 | global batch size: 512 | lm loss: 2.042215E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.115 | TFLOPs: 23.69 | 63: iteration 15980/ 24424 | consumed samples: 8181760 | consumed tokens: 16756244480 | elapsed time per iteration (s): 2.23 | learning rate: 6.895E-05 | global batch size: 512 | lm loss: 2.011649E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.008 | TFLOPs: 23.68 | 63: iteration 15990/ 24424 | consumed samples: 8186880 | consumed tokens: 16766730240 | elapsed time per iteration (s): 2.23 | learning rate: 6.885E-05 | global batch size: 512 | lm loss: 2.043206E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.897 | TFLOPs: 23.67 | 0: [2022-11-26 04:06:54,227] [INFO] [logging.py:68:log_dist] [Rank 0] step=16000, skipped=0, lr=[6.874118404023573e-05, 6.874118404023573e-05, 6.874118404023573e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] 63: iteration 16000/ 24424 | consumed samples: 8192000 | consumed tokens: 16777216000 | elapsed time per iteration (s): 2.23 | learning rate: 6.874E-05 | global batch size: 512 | lm loss: 2.024591E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.187 | TFLOPs: 23.59 | 0: steps: 16000 loss: 2.0001 iter time (s): 2.326 samples/sec: 220.130 63: ------------------------------------------------------------------------------------------- 63: valid loss at iteration 16000 | lm loss value: 1.937160E+00 | lm loss PPL: 6.939020E+00 | 63: ------------------------------------------------------------------------------------------- 0: saving checkpoint at iteration 16000 to checkpoints_3b9 0: [2022-11-26 04:06:54,991] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step16000 is begin to save! 0: [2022-11-26 04:06:55,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_01-model_00-model_states.pt... 32: [2022-11-26 04:06:55,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_21-model_00-model_states.pt... 32: [2022-11-26 04:06:55,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_21-model_00-model_states.pt. 32: [2022-11-26 04:06:55,271] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_22-model_00-model_states.pt... 0: [2022-11-26 04:06:55,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_01-model_00-model_states.pt. 0: [2022-11-26 04:06:55,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_03-model_00-model_states.pt... 32: [2022-11-26 04:06:55,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_22-model_00-model_states.pt. 32: [2022-11-26 04:06:55,513] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_23-model_00-model_states.pt... 0: [2022-11-26 04:06:55,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_03-model_00-model_states.pt. 0: [2022-11-26 04:06:55,658] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_04-model_00-model_states.pt... 32: [2022-11-26 04:06:55,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_23-model_00-model_states.pt. 32: [2022-11-26 04:06:55,751] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_24-model_00-model_states.pt... 0: [2022-11-26 04:06:55,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_04-model_00-model_states.pt. 0: [2022-11-26 04:06:55,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_05-model_00-model_states.pt... 32: [2022-11-26 04:06:55,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_24-model_00-model_states.pt. 32: [2022-11-26 04:06:55,999] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_25-model_00-model_states.pt... 0: [2022-11-26 04:06:56,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_05-model_00-model_states.pt. 0: [2022-11-26 04:06:56,131] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_06-model_00-model_states.pt... 32: [2022-11-26 04:06:56,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_25-model_00-model_states.pt. 32: [2022-11-26 04:06:56,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_26-model_00-model_states.pt... 0: [2022-11-26 04:06:56,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_06-model_00-model_states.pt. 0: [2022-11-26 04:06:56,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_07-model_00-model_states.pt... 32: [2022-11-26 04:06:56,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_26-model_00-model_states.pt. 32: [2022-11-26 04:06:56,473] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_27-model_00-model_states.pt... 0: [2022-11-26 04:06:56,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_07-model_00-model_states.pt. 0: [2022-11-26 04:06:56,593] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_08-model_00-model_states.pt... 32: [2022-11-26 04:06:56,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_27-model_00-model_states.pt. 32: [2022-11-26 04:06:56,708] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_28-model_00-model_states.pt... 0: [2022-11-26 04:06:56,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_08-model_00-model_states.pt. 0: [2022-11-26 04:06:56,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_09-model_00-model_states.pt... 32: [2022-11-26 04:06:56,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_28-model_00-model_states.pt. 32: [2022-11-26 04:06:56,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_29-model_00-model_states.pt... 0: [2022-11-26 04:06:57,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_09-model_00-model_states.pt. 0: [2022-11-26 04:06:57,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_10-model_00-model_states.pt... 32: [2022-11-26 04:06:57,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_29-model_00-model_states.pt. 32: [2022-11-26 04:06:57,187] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_30-model_00-model_states.pt... 0: [2022-11-26 04:06:57,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_10-model_00-model_states.pt. 0: [2022-11-26 04:06:57,267] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_11-model_00-model_states.pt... 32: [2022-11-26 04:06:57,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_30-model_00-model_states.pt. 32: [2022-11-26 04:06:57,427] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_31-model_00-model_states.pt... 0: [2022-11-26 04:06:57,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_11-model_00-model_states.pt. 0: [2022-11-26 04:06:57,496] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_12-model_00-model_states.pt... 32: [2022-11-26 04:06:57,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_31-model_00-model_states.pt. 32: [2022-11-26 04:06:57,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_32-model_00-model_states.pt... 0: [2022-11-26 04:06:57,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_12-model_00-model_states.pt. 0: [2022-11-26 04:06:57,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_13-model_00-model_states.pt... 32: [2022-11-26 04:06:57,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_32-model_00-model_states.pt. 32: [2022-11-26 04:06:57,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_33-model_00-model_states.pt... 0: [2022-11-26 04:06:57,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_13-model_00-model_states.pt. 0: [2022-11-26 04:06:57,944] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_14-model_00-model_states.pt... 32: [2022-11-26 04:06:58,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_33-model_00-model_states.pt. 32: [2022-11-26 04:06:58,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_34-model_00-model_states.pt... 0: [2022-11-26 04:06:58,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_14-model_00-model_states.pt. 0: [2022-11-26 04:06:58,163] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_15-model_00-model_states.pt... 32: [2022-11-26 04:06:58,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_34-model_00-model_states.pt. 32: [2022-11-26 04:06:58,354] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_35-model_00-model_states.pt... 0: [2022-11-26 04:06:58,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_15-model_00-model_states.pt. 0: [2022-11-26 04:06:58,381] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_16-model_00-model_states.pt... 32: [2022-11-26 04:06:58,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_35-model_00-model_states.pt. 32: [2022-11-26 04:06:58,581] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_36-model_00-model_states.pt... 0: [2022-11-26 04:06:58,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_16-model_00-model_states.pt. 0: [2022-11-26 04:06:58,602] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_17-model_00-model_states.pt... 32: [2022-11-26 04:06:58,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_36-model_00-model_states.pt. 32: [2022-11-26 04:06:58,812] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_37-model_00-model_states.pt... 0: [2022-11-26 04:06:58,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_17-model_00-model_states.pt. 0: [2022-11-26 04:06:58,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_18-model_00-model_states.pt... 0: [2022-11-26 04:06:59,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_18-model_00-model_states.pt. 0: [2022-11-26 04:06:59,038] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_19-model_00-model_states.pt... 32: [2022-11-26 04:06:59,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_37-model_00-model_states.pt. 32: [2022-11-26 04:06:59,039] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_38-model_00-model_states.pt... 0: [2022-11-26 04:06:59,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_19-model_00-model_states.pt. 0: [2022-11-26 04:06:59,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_20-model_00-model_states.pt... 32: [2022-11-26 04:06:59,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_38-model_00-model_states.pt. 32: [2022-11-26 04:06:59,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/layer_40-model_00-model_states.pt... 32: [2022-11-26 04:06:59,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_40-model_00-model_states.pt. 32: [2022-11-26 04:06:59,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/mp_rank_01_model_states.pt... 32: [2022-11-26 04:06:59,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/mp_rank_01_model_states.pt. 0: [2022-11-26 04:06:59,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/layer_20-model_00-model_states.pt. 0: [2022-11-26 04:06:59,481] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step16000/mp_rank_00_model_states.pt 0: [2022-11-26 04:06:59,481] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/mp_rank_00_model_states.pt... 0: [2022-11-26 04:06:59,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/mp_rank_00_model_states.pt. 51: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 53: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 53: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 61: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 61: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 61: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 59: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 59: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 59: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 52: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 58: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 62: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 62: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 62: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 60: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 60: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 33: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 33: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 33: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 35: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 40: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 40: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 44: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 44: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 44: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 32: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 32: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 42: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 46: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 46: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 46: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 46: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 36: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 36: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 36: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 48: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 48: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 41: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 37: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 37: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 37: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 37: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 37: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 47: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 47: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 47: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 43: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 43: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 45: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 45: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 39: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 39: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 39: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 39: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 39: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 50: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 38: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 38: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 38: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 38: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 51: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 55: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 55: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 55: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 53: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 57: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 61: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 59: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 63: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 63: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 19: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 21: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 15: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 15: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 52: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 52: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 52: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 52: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 54: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 56: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 56: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 56: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 58: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 58: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 62: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 62: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 16: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 60: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 60: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 60: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 2: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 2: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 4: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 4: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 4: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 14: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 14: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 14: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 18: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 18: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 10: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 8: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 8: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 0: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 11: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 17: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 17: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 9: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 9: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 23: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 23: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 23: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 23: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 25: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 27: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 27: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 27: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 29: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 29: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 29: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 31: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 33: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 33: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 1: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 1: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 7: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 3: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 35: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 35: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 35: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 22: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 22: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 12: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 12: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 12: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 12: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 30: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 30: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 24: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 24: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 24: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 28: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 28: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 28: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 20: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 20: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 34: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 34: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 34: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 40: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 44: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 6: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 6: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 32: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 32: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 42: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 42: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 42: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 42: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 46: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 36: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 48: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 41: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 41: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 37: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 47: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 47: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 49: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 43: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 45: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 45: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 39: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 50: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 50: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 38: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 55: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 57: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 57: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 61: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 63: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 63: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 19: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 21: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 21: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 21: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 21: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 15: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 52: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 54: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 54: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 54: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 56: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 58: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 16: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 16: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 16: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 2: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 2: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 4: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 4: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 14: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 18: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 10: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 8: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 0: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 11: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 11: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 17: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 17: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 9: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 9: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 9: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 13: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 13: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 23: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 23: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 25: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 25: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 25: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 27: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 27: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 29: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 29: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 31: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 5: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 5: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 1: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 7: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 7: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 3: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 22: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 22: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 22: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 12: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 30: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 30: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 24: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 24: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 24: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 28: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 28: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 28: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 20: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 34: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 34: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 40: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 44: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 6: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 32: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 46: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 36: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 48: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 49: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 43: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 50: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 50: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 19: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 21: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 15: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 15: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 15: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 15: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 58: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 16: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 2: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 4: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 14: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 18: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 18: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 10: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 10: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 10: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 10: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 8: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 8: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 8: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 11: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 17: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 17: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 9: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 13: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 23: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 23: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 25: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 27: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 29: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 31: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 5: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 5: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 1: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 1: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 1: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 1: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 7: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 7: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 3: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 3: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 22: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 12: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 30: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 24: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 28: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 28: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 20: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 20: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 20: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 6: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 32: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 32: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 19: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 21: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 16: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 16: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 2: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 4: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 14: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 10: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 8: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 0: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 11: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 17: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 9: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 13: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 13: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 25: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 27: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 29: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 31: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 31: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 5: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 5: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 1: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 7: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 3: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 22: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 12: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 30: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 30: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 20: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 6: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 32: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 19: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 21: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 16: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 2: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 4: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 14: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 18: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 10: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 8: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 11: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 9: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 13: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 27: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 29: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 31: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 5: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 7: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 3: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 22: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 12: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 30: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 20: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 6: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 19: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 2: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 14: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 18: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 0: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 11: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 13: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 31: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 7: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 3: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 6: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 6: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 19: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 0: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 11: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 31: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 3: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 19: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 0: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 24: [2022-11-26 04:06:59,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step16000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 22: [2022-11-26 04:06:59,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-26 04:06:59,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-26 04:06:59,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 2: [2022-11-26 04:06:59,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-26 04:06:59,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-26 04:06:59,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 32: [2022-11-26 04:06:59,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:06:59,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-26 04:06:59,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 21: [2022-11-26 04:06:59,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-26 04:06:59,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-26 04:06:59,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 58: [2022-11-26 04:06:59,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-26 04:06:59,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-26 04:06:59,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 28: [2022-11-26 04:06:59,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-26 04:06:59,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-26 04:06:59,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 26: [2022-11-26 04:06:59,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 7: [2022-11-26 04:06:59,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 26: [2022-11-26 04:06:59,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 7: [2022-11-26 04:06:59,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 26: [2022-11-26 04:06:59,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 7: [2022-11-26 04:06:59,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 36: [2022-11-26 04:06:59,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 10: [2022-11-26 04:06:59,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-26 04:06:59,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-26 04:06:59,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 36: [2022-11-26 04:06:59,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-26 04:06:59,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 0: [2022-11-26 04:06:59,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-26 04:06:59,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-26 04:06:59,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 25: [2022-11-26 04:06:59,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-26 04:06:59,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 4: [2022-11-26 04:06:59,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 25: [2022-11-26 04:06:59,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 19: [2022-11-26 04:06:59,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-26 04:06:59,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 9: [2022-11-26 04:06:59,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 19: [2022-11-26 04:06:59,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 9: [2022-11-26 04:06:59,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-26 04:06:59,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 9: [2022-11-26 04:06:59,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 9: [2022-11-26 04:06:59,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-26 04:06:59,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 43: [2022-11-26 04:06:59,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-26 04:06:59,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-26 04:06:59,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 8: [2022-11-26 04:06:59,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-26 04:06:59,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 6: [2022-11-26 04:06:59,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 8: [2022-11-26 04:06:59,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 6: [2022-11-26 04:06:59,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-26 04:06:59,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 23: [2022-11-26 04:06:59,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-26 04:06:59,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 23: [2022-11-26 04:06:59,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 35: [2022-11-26 04:06:59,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-26 04:06:59,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-26 04:06:59,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 46: [2022-11-26 04:06:59,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:06:59,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 46: [2022-11-26 04:06:59,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 50: [2022-11-26 04:06:59,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 50: [2022-11-26 04:06:59,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-26 04:06:59,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 5: [2022-11-26 04:06:59,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-26 04:06:59,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 5: [2022-11-26 04:06:59,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 48: [2022-11-26 04:06:59,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 30: [2022-11-26 04:06:59,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 48: [2022-11-26 04:06:59,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-26 04:06:59,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 30: [2022-11-26 04:06:59,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-26 04:06:59,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 31: [2022-11-26 04:06:59,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-26 04:06:59,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-26 04:06:59,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 4: [2022-11-26 04:06:59,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-26 04:06:59,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 27: [2022-11-26 04:06:59,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-26 04:06:59,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-26 04:06:59,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 4: [2022-11-26 04:06:59,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-26 04:06:59,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-26 04:06:59,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 37: [2022-11-26 04:06:59,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-26 04:06:59,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 43: [2022-11-26 04:06:59,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 37: [2022-11-26 04:06:59,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 43: [2022-11-26 04:06:59,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-26 04:06:59,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 12: [2022-11-26 04:06:59,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 56: [2022-11-26 04:06:59,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 12: [2022-11-26 04:06:59,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-26 04:06:59,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 56: [2022-11-26 04:06:59,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-26 04:06:59,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 0: [2022-11-26 04:06:59,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-26 04:06:59,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-26 04:06:59,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 42: [2022-11-26 04:06:59,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 42: [2022-11-26 04:06:59,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-26 04:06:59,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 54: [2022-11-26 04:06:59,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 5: [2022-11-26 04:06:59,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 21: [2022-11-26 04:06:59,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-26 04:06:59,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-26 04:06:59,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 28: [2022-11-26 04:06:59,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-26 04:06:59,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-26 04:06:59,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 5: [2022-11-26 04:06:59,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 6: [2022-11-26 04:06:59,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 49: [2022-11-26 04:06:59,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 5: [2022-11-26 04:06:59,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 6: [2022-11-26 04:06:59,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 49: [2022-11-26 04:06:59,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 6: [2022-11-26 04:06:59,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 49: [2022-11-26 04:06:59,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 58: [2022-11-26 04:06:59,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 23: [2022-11-26 04:06:59,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 58: [2022-11-26 04:06:59,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 23: [2022-11-26 04:06:59,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 25: [2022-11-26 04:06:59,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 58: [2022-11-26 04:06:59,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 23: [2022-11-26 04:06:59,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 44: [2022-11-26 04:06:59,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 25: [2022-11-26 04:06:59,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 44: [2022-11-26 04:06:59,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 25: [2022-11-26 04:06:59,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 44: [2022-11-26 04:06:59,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 30: [2022-11-26 04:06:59,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-26 04:06:59,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 30: [2022-11-26 04:06:59,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 13: [2022-11-26 04:06:59,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 39: [2022-11-26 04:06:59,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 13: [2022-11-26 04:06:59,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 39: [2022-11-26 04:06:59,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 13: [2022-11-26 04:06:59,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 39: [2022-11-26 04:06:59,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 52: [2022-11-26 04:06:59,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-26 04:06:59,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-26 04:06:59,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 36: [2022-11-26 04:06:59,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 63: [2022-11-26 04:06:59,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 36: [2022-11-26 04:06:59,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 63: [2022-11-26 04:06:59,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 36: [2022-11-26 04:06:59,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 63: [2022-11-26 04:06:59,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 57: [2022-11-26 04:06:59,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 57: [2022-11-26 04:06:59,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-26 04:06:59,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 53: [2022-11-26 04:06:59,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-26 04:06:59,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-26 04:06:59,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 16: [2022-11-26 04:06:59,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:06:59,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-26 04:06:59,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 10: [2022-11-26 04:06:59,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-26 04:06:59,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-26 04:06:59,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 41: [2022-11-26 04:06:59,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-26 04:06:59,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 34: [2022-11-26 04:06:59,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-26 04:06:59,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 41: [2022-11-26 04:06:59,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-26 04:06:59,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 41: [2022-11-26 04:06:59,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 41: [2022-11-26 04:06:59,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 34: [2022-11-26 04:06:59,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-26 04:06:59,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-26 04:06:59,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 34: [2022-11-26 04:06:59,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 31: [2022-11-26 04:06:59,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-26 04:06:59,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-26 04:06:59,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 47: [2022-11-26 04:06:59,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-26 04:06:59,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-26 04:06:59,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 47: [2022-11-26 04:06:59,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-26 04:06:59,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-26 04:06:59,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 26: [2022-11-26 04:06:59,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 26: [2022-11-26 04:06:59,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 26: [2022-11-26 04:06:59,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 60: [2022-11-26 04:06:59,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 26: [2022-11-26 04:06:59,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 60: [2022-11-26 04:06:59,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 26: [2022-11-26 04:06:59,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 60: [2022-11-26 04:06:59,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 26: [2022-11-26 04:06:59,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 7: [2022-11-26 04:06:59,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-26 04:06:59,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-26 04:06:59,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 32: [2022-11-26 04:06:59,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:06:59,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-26 04:06:59,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 50: [2022-11-26 04:06:59,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 48: [2022-11-26 04:06:59,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 50: [2022-11-26 04:06:59,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 48: [2022-11-26 04:06:59,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 50: [2022-11-26 04:06:59,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 48: [2022-11-26 04:06:59,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 17: [2022-11-26 04:06:59,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 25: [2022-11-26 04:06:59,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 46: [2022-11-26 04:06:59,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 17: [2022-11-26 04:06:59,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-26 04:06:59,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 25: [2022-11-26 04:06:59,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 46: [2022-11-26 04:06:59,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 25: [2022-11-26 04:06:59,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 46: [2022-11-26 04:06:59,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 19: [2022-11-26 04:06:59,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-26 04:06:59,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-26 04:06:59,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 53: [2022-11-26 04:06:59,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-26 04:06:59,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-26 04:06:59,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 33: [2022-11-26 04:06:59,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-26 04:06:59,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 28: [2022-11-26 04:06:59,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 33: [2022-11-26 04:06:59,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-26 04:06:59,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 28: [2022-11-26 04:06:59,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 54: [2022-11-26 04:06:59,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 33: [2022-11-26 04:06:59,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 33: [2022-11-26 04:06:59,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 28: [2022-11-26 04:06:59,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 54: [2022-11-26 04:06:59,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 4: [2022-11-26 04:06:59,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 54: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 35: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 12: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 57: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 12: [2022-11-26 04:06:59,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 35: [2022-11-26 04:06:59,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 12: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 57: [2022-11-26 04:06:59,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 20: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 57: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 20: [2022-11-26 04:06:59,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 63: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 20: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 63: [2022-11-26 04:06:59,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 18: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-26 04:06:59,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 44: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-26 04:06:59,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 21: [2022-11-26 04:06:59,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-26 04:06:59,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-26 04:06:59,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 14: [2022-11-26 04:06:59,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-26 04:06:59,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-26 04:06:59,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 18: [2022-11-26 04:06:59,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 14: [2022-11-26 04:06:59,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 18: [2022-11-26 04:06:59,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 14: [2022-11-26 04:06:59,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 18: [2022-11-26 04:06:59,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 14: [2022-11-26 04:06:59,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 61: [2022-11-26 04:06:59,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-26 04:06:59,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-26 04:06:59,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 6: [2022-11-26 04:06:59,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 6: [2022-11-26 04:06:59,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 6: [2022-11-26 04:06:59,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 61: [2022-11-26 04:06:59,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 61: [2022-11-26 04:06:59,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 61: [2022-11-26 04:06:59,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 16: [2022-11-26 04:06:59,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 42: [2022-11-26 04:06:59,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 16: [2022-11-26 04:06:59,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-26 04:06:59,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 42: [2022-11-26 04:06:59,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-26 04:06:59,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 54: [2022-11-26 04:06:59,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 60: [2022-11-26 04:06:59,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-26 04:06:59,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-26 04:06:59,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 49: [2022-11-26 04:06:59,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-26 04:06:59,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-26 04:06:59,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 22: [2022-11-26 04:06:59,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-26 04:06:59,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-26 04:06:59,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 3: [2022-11-26 04:06:59,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 3: [2022-11-26 04:06:59,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-26 04:06:59,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 3: [2022-11-26 04:06:59,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-26 04:06:59,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 3: [2022-11-26 04:06:59,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 32: [2022-11-26 04:06:59,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:06:59,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-26 04:06:59,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 37: [2022-11-26 04:06:59,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 37: [2022-11-26 04:06:59,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-26 04:06:59,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 29: [2022-11-26 04:06:59,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-26 04:06:59,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-26 04:06:59,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-26 04:06:59,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 55: [2022-11-26 04:06:59,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 29: [2022-11-26 04:06:59,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 29: [2022-11-26 04:06:59,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 55: [2022-11-26 04:06:59,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-26 04:06:59,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 20: [2022-11-26 04:06:59,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-26 04:06:59,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-26 04:06:59,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 13: [2022-11-26 04:06:59,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-26 04:06:59,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-26 04:06:59,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 48: [2022-11-26 04:06:59,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-26 04:06:59,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-26 04:06:59,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 7: [2022-11-26 04:06:59,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-26 04:06:59,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 35: [2022-11-26 04:06:59,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 7: [2022-11-26 04:06:59,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 35: [2022-11-26 04:06:59,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-26 04:06:59,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 47: [2022-11-26 04:06:59,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-26 04:06:59,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-26 04:06:59,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 40: [2022-11-26 04:06:59,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-26 04:06:59,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-26 04:06:59,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 23: [2022-11-26 04:06:59,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 40: [2022-11-26 04:06:59,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 23: [2022-11-26 04:06:59,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 40: [2022-11-26 04:06:59,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 40: [2022-11-26 04:06:59,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 23: [2022-11-26 04:06:59,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 0: [2022-11-26 04:06:59,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 0: [2022-11-26 04:06:59,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-26 04:06:59,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 43: [2022-11-26 04:06:59,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-26 04:06:59,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-26 04:06:59,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 17: [2022-11-26 04:06:59,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 36: [2022-11-26 04:06:59,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-26 04:06:59,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 17: [2022-11-26 04:06:59,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 36: [2022-11-26 04:06:59,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 17: [2022-11-26 04:06:59,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 56: [2022-11-26 04:06:59,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 39: [2022-11-26 04:06:59,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-26 04:06:59,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 56: [2022-11-26 04:06:59,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 39: [2022-11-26 04:06:59,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 56: [2022-11-26 04:06:59,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 31: [2022-11-26 04:06:59,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-26 04:06:59,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 1: [2022-11-26 04:06:59,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-26 04:06:59,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 31: [2022-11-26 04:06:59,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 1: [2022-11-26 04:06:59,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-26 04:06:59,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 1: [2022-11-26 04:06:59,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 1: [2022-11-26 04:06:59,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 30: [2022-11-26 04:06:59,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 46: [2022-11-26 04:06:59,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:06:59,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 30: [2022-11-26 04:06:59,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 46: [2022-11-26 04:06:59,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 10: [2022-11-26 04:06:59,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 30: [2022-11-26 04:06:59,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 10: [2022-11-26 04:06:59,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-26 04:06:59,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 54: [2022-11-26 04:06:59,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 54: [2022-11-26 04:06:59,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 55: [2022-11-26 04:06:59,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 54: [2022-11-26 04:06:59,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 4: [2022-11-26 04:06:59,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 55: [2022-11-26 04:06:59,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 4: [2022-11-26 04:06:59,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 55: [2022-11-26 04:06:59,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 55: [2022-11-26 04:06:59,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-26 04:06:59,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-26 04:06:59,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 42: [2022-11-26 04:06:59,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 42: [2022-11-26 04:06:59,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-26 04:06:59,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 33: [2022-11-26 04:06:59,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-26 04:06:59,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-26 04:06:59,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 38: [2022-11-26 04:06:59,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-26 04:06:59,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-26 04:06:59,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-26 04:06:59,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 38: [2022-11-26 04:06:59,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-26 04:06:59,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 62: [2022-11-26 04:06:59,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-26 04:06:59,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-26 04:06:59,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-26 04:06:59,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 29: [2022-11-26 04:06:59,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 62: [2022-11-26 04:06:59,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 62: [2022-11-26 04:06:59,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 29: [2022-11-26 04:06:59,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 62: [2022-11-26 04:06:59,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 29: [2022-11-26 04:06:59,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 62: [2022-11-26 04:06:59,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 62: [2022-11-26 04:06:59,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 53: [2022-11-26 04:06:59,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 4: [2022-11-26 04:06:59,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 53: [2022-11-26 04:06:59,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 4: [2022-11-26 04:06:59,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 53: [2022-11-26 04:06:59,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 4: [2022-11-26 04:06:59,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 19: [2022-11-26 04:06:59,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-26 04:06:59,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-26 04:06:59,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 37: [2022-11-26 04:06:59,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 37: [2022-11-26 04:06:59,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-26 04:06:59,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 38: [2022-11-26 04:06:59,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 38: [2022-11-26 04:06:59,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 44: [2022-11-26 04:06:59,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 38: [2022-11-26 04:06:59,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 44: [2022-11-26 04:06:59,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-26 04:06:59,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 40: [2022-11-26 04:06:59,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-26 04:06:59,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-26 04:06:59,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 10: [2022-11-26 04:06:59,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 21: [2022-11-26 04:06:59,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 10: [2022-11-26 04:06:59,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 21: [2022-11-26 04:06:59,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-26 04:06:59,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 10: [2022-11-26 04:06:59,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 14: [2022-11-26 04:06:59,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-26 04:06:59,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-26 04:06:59,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 5: [2022-11-26 04:06:59,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 60: [2022-11-26 04:06:59,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 60: [2022-11-26 04:06:59,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-26 04:06:59,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 5: [2022-11-26 04:06:59,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 63: [2022-11-26 04:06:59,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 5: [2022-11-26 04:06:59,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 63: [2022-11-26 04:06:59,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-26 04:06:59,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 20: [2022-11-26 04:06:59,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-26 04:06:59,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-26 04:06:59,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 25: [2022-11-26 04:06:59,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 25: [2022-11-26 04:06:59,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-26 04:06:59,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 12: [2022-11-26 04:06:59,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 12: [2022-11-26 04:06:59,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-26 04:06:59,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 50: [2022-11-26 04:06:59,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 50: [2022-11-26 04:06:59,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 0: [2022-11-26 04:06:59,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 0: [2022-11-26 04:06:59,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 50: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 15: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 15: [2022-11-26 04:06:59,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 0: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 15: [2022-11-26 04:06:59,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-26 04:06:59,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-26 04:06:59,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 13: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 15: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 15: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 15: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 11: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 13: [2022-11-26 04:06:59,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 11: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 13: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 11: [2022-11-26 04:06:59,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-26 04:06:59,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-26 04:06:59,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 11: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 11: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 45: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-26 04:06:59,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 3: [2022-11-26 04:06:59,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-26 04:06:59,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-26 04:06:59,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 45: [2022-11-26 04:06:59,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-26 04:06:59,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-26 04:06:59,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 56: [2022-11-26 04:06:59,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-26 04:06:59,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-26 04:06:59,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 30: [2022-11-26 04:06:59,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-26 04:06:59,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-26 04:06:59,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 41: [2022-11-26 04:06:59,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 41: [2022-11-26 04:06:59,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-26 04:06:59,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 2: [2022-11-26 04:06:59,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-26 04:06:59,793] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-26 04:06:59,793] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 59: [2022-11-26 04:06:59,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-26 04:06:59,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-26 04:06:59,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 6: [2022-11-26 04:06:59,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-26 04:06:59,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-26 04:06:59,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 51: [2022-11-26 04:06:59,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:06:59,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:06:59,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:06:59,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-26 04:06:59,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-26 04:06:59,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-26 04:06:59,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 51: [2022-11-26 04:06:59,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 51: [2022-11-26 04:06:59,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 16: [2022-11-26 04:06:59,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:06:59,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 61: [2022-11-26 04:06:59,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 16: [2022-11-26 04:06:59,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 61: [2022-11-26 04:06:59,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-26 04:06:59,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 24: [2022-11-26 04:06:59,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 24: [2022-11-26 04:06:59,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-26 04:06:59,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-26 04:06:59,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-26 04:06:59,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 24: [2022-11-26 04:06:59,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 42: [2022-11-26 04:06:59,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 42: [2022-11-26 04:06:59,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-26 04:06:59,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 61: [2022-11-26 04:06:59,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-26 04:06:59,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-26 04:06:59,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 45: [2022-11-26 04:06:59,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-26 04:06:59,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-26 04:06:59,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 18: [2022-11-26 04:06:59,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-26 04:06:59,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 3: [2022-11-26 04:06:59,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 18: [2022-11-26 04:06:59,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 3: [2022-11-26 04:06:59,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-26 04:06:59,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 7: [2022-11-26 04:06:59,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 7: [2022-11-26 04:06:59,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-26 04:06:59,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 26: [2022-11-26 04:06:59,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-26 04:06:59,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-26 04:06:59,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 5: [2022-11-26 04:06:59,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-26 04:06:59,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-26 04:06:59,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 57: [2022-11-26 04:06:59,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 59: [2022-11-26 04:06:59,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-26 04:06:59,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 57: [2022-11-26 04:06:59,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 59: [2022-11-26 04:06:59,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 57: [2022-11-26 04:06:59,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 59: [2022-11-26 04:06:59,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 27: [2022-11-26 04:06:59,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-26 04:06:59,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 59: [2022-11-26 04:06:59,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 27: [2022-11-26 04:06:59,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 59: [2022-11-26 04:06:59,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 34: [2022-11-26 04:06:59,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 34: [2022-11-26 04:06:59,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-26 04:06:59,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 1: [2022-11-26 04:06:59,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-26 04:06:59,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-26 04:06:59,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 24: [2022-11-26 04:06:59,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-26 04:06:59,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-26 04:06:59,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 28: [2022-11-26 04:06:59,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-26 04:06:59,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-26 04:06:59,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 8: [2022-11-26 04:06:59,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-26 04:06:59,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-26 04:06:59,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 2: [2022-11-26 04:06:59,809] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-26 04:06:59,809] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-26 04:06:59,809] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 39: [2022-11-26 04:06:59,809] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-26 04:06:59,809] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-26 04:06:59,809] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 34: [2022-11-26 04:06:59,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 34: [2022-11-26 04:06:59,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-26 04:06:59,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 55: [2022-11-26 04:06:59,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 55: [2022-11-26 04:06:59,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-26 04:06:59,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 36: [2022-11-26 04:06:59,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-26 04:06:59,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-26 04:06:59,817] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 52: [2022-11-26 04:06:59,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 52: [2022-11-26 04:06:59,817] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-26 04:06:59,817] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 9: [2022-11-26 04:06:59,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 9: [2022-11-26 04:06:59,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-26 04:06:59,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 23: [2022-11-26 04:06:59,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 23: [2022-11-26 04:06:59,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 23: [2022-11-26 04:06:59,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 12: [2022-11-26 04:06:59,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 12: [2022-11-26 04:06:59,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-26 04:06:59,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 62: [2022-11-26 04:06:59,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-26 04:06:59,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-26 04:06:59,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 43: [2022-11-26 04:06:59,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-26 04:06:59,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-26 04:06:59,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 8: [2022-11-26 04:06:59,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-26 04:06:59,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-26 04:06:59,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 38: [2022-11-26 04:06:59,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-26 04:06:59,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-26 04:06:59,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 32: [2022-11-26 04:06:59,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:06:59,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-26 04:06:59,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 58: [2022-11-26 04:06:59,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-26 04:06:59,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-26 04:06:59,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 50: [2022-11-26 04:06:59,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-26 04:06:59,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-26 04:06:59,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 35: [2022-11-26 04:06:59,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 35: [2022-11-26 04:06:59,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-26 04:06:59,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 22: [2022-11-26 04:06:59,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-26 04:06:59,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-26 04:06:59,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 31: [2022-11-26 04:06:59,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 31: [2022-11-26 04:06:59,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-26 04:06:59,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 39: [2022-11-26 04:06:59,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-26 04:06:59,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-26 04:06:59,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 19: [2022-11-26 04:06:59,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-26 04:06:59,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-26 04:06:59,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 52: [2022-11-26 04:06:59,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-26 04:06:59,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-26 04:06:59,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 9: [2022-11-26 04:06:59,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-26 04:06:59,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-26 04:06:59,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 17: [2022-11-26 04:06:59,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-26 04:06:59,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-26 04:06:59,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 33: [2022-11-26 04:06:59,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 33: [2022-11-26 04:06:59,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 33: [2022-11-26 04:06:59,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 11: [2022-11-26 04:06:59,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 11: [2022-11-26 04:06:59,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 11: [2022-11-26 04:06:59,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 63: [2022-11-26 04:06:59,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-26 04:06:59,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-26 04:06:59,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 8: [2022-11-26 04:06:59,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-26 04:06:59,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-26 04:06:59,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 49: [2022-11-26 04:06:59,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-26 04:06:59,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-26 04:06:59,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 27: [2022-11-26 04:06:59,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-26 04:06:59,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-26 04:06:59,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 44: [2022-11-26 04:06:59,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-26 04:06:59,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-26 04:06:59,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 46: [2022-11-26 04:06:59,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:06:59,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-26 04:06:59,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 47: [2022-11-26 04:06:59,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-26 04:06:59,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 47: [2022-11-26 04:06:59,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 22: [2022-11-26 04:06:59,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-26 04:06:59,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-26 04:06:59,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 48: [2022-11-26 04:06:59,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-26 04:06:59,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-26 04:06:59,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 37: [2022-11-26 04:06:59,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-26 04:06:59,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-26 04:06:59,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 52: [2022-11-26 04:06:59,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-26 04:06:59,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-26 04:06:59,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 60: [2022-11-26 04:06:59,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-26 04:06:59,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-26 04:06:59,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 54: [2022-11-26 04:06:59,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 54: [2022-11-26 04:06:59,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-26 04:06:59,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 53: [2022-11-26 04:06:59,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-26 04:06:59,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-26 04:06:59,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 59: [2022-11-26 04:06:59,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-26 04:06:59,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-26 04:06:59,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 27: [2022-11-26 04:06:59,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 27: [2022-11-26 04:06:59,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-26 04:06:59,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 21: [2022-11-26 04:06:59,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-26 04:06:59,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 21: [2022-11-26 04:06:59,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 4: [2022-11-26 04:06:59,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 4: [2022-11-26 04:06:59,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-26 04:06:59,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 40: [2022-11-26 04:06:59,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-26 04:06:59,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-26 04:06:59,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 27: [2022-11-26 04:06:59,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-26 04:06:59,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-26 04:06:59,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 51: [2022-11-26 04:06:59,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:06:59,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-26 04:06:59,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 25: [2022-11-26 04:06:59,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 25: [2022-11-26 04:06:59,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-26 04:06:59,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 20: [2022-11-26 04:06:59,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-26 04:06:59,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 20: [2022-11-26 04:06:59,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 29: [2022-11-26 04:06:59,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-26 04:06:59,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-26 04:06:59,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 13: [2022-11-26 04:06:59,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 13: [2022-11-26 04:06:59,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-26 04:06:59,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 3: [2022-11-26 04:06:59,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-26 04:06:59,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 3: [2022-11-26 04:06:59,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 10: [2022-11-26 04:06:59,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-26 04:06:59,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 56: [2022-11-26 04:06:59,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 10: [2022-11-26 04:06:59,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 56: [2022-11-26 04:06:59,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 56: [2022-11-26 04:06:59,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 14: [2022-11-26 04:06:59,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-26 04:06:59,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-26 04:06:59,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 41: [2022-11-26 04:06:59,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-26 04:06:59,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 41: [2022-11-26 04:06:59,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 0: [2022-11-26 04:06:59,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 9: [2022-11-26 04:06:59,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-26 04:06:59,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 15: [2022-11-26 04:06:59,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-26 04:06:59,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 9: [2022-11-26 04:06:59,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 15: [2022-11-26 04:06:59,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 34: [2022-11-26 04:06:59,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 34: [2022-11-26 04:06:59,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-26 04:06:59,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 45: [2022-11-26 04:06:59,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 45: [2022-11-26 04:06:59,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-26 04:06:59,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 30: [2022-11-26 04:06:59,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-26 04:06:59,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-26 04:06:59,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 61: [2022-11-26 04:06:59,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-26 04:06:59,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-26 04:06:59,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 1: [2022-11-26 04:06:59,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-26 04:06:59,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-26 04:06:59,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 7: [2022-11-26 04:06:59,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-26 04:06:59,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-26 04:06:59,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 6: [2022-11-26 04:06:59,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-26 04:06:59,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-26 04:06:59,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 26: [2022-11-26 04:06:59,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-26 04:06:59,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-26 04:06:59,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 42: [2022-11-26 04:06:59,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 5: [2022-11-26 04:06:59,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-26 04:06:59,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-26 04:06:59,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 42: [2022-11-26 04:06:59,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 42: [2022-11-26 04:06:59,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 55: [2022-11-26 04:06:59,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 16: [2022-11-26 04:06:59,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:06:59,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 55: [2022-11-26 04:06:59,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 16: [2022-11-26 04:06:59,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 55: [2022-11-26 04:06:59,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 2: [2022-11-26 04:06:59,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-26 04:06:59,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-26 04:06:59,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 35: [2022-11-26 04:06:59,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 35: [2022-11-26 04:06:59,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-26 04:06:59,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 18: [2022-11-26 04:06:59,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 57: [2022-11-26 04:06:59,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 18: [2022-11-26 04:06:59,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-26 04:06:59,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 57: [2022-11-26 04:06:59,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 0: [2022-11-26 04:06:59,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 57: [2022-11-26 04:06:59,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 0: [2022-11-26 04:06:59,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 23: [2022-11-26 04:06:59,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-26 04:06:59,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-26 04:06:59,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 32: [2022-11-26 04:06:59,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:06:59,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-26 04:06:59,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 24: [2022-11-26 04:06:59,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-26 04:06:59,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-26 04:06:59,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 58: [2022-11-26 04:06:59,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-26 04:06:59,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-26 04:06:59,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 19: [2022-11-26 04:06:59,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 2: [2022-11-26 04:06:59,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 19: [2022-11-26 04:06:59,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-26 04:06:59,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 2: [2022-11-26 04:06:59,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-26 04:06:59,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 28: [2022-11-26 04:06:59,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 28: [2022-11-26 04:06:59,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-26 04:06:59,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 31: [2022-11-26 04:06:59,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-26 04:06:59,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-26 04:06:59,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 62: [2022-11-26 04:06:59,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-26 04:06:59,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-26 04:06:59,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 36: [2022-11-26 04:06:59,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 8: [2022-11-26 04:06:59,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 36: [2022-11-26 04:06:59,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 8: [2022-11-26 04:06:59,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 12: [2022-11-26 04:06:59,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 36: [2022-11-26 04:06:59,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 8: [2022-11-26 04:06:59,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 12: [2022-11-26 04:06:59,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-26 04:06:59,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 43: [2022-11-26 04:06:59,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-26 04:06:59,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-26 04:06:59,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 38: [2022-11-26 04:06:59,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 38: [2022-11-26 04:06:59,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-26 04:06:59,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 33: [2022-11-26 04:06:59,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-26 04:06:59,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-26 04:06:59,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 50: [2022-11-26 04:06:59,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-26 04:06:59,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-26 04:06:59,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 22: [2022-11-26 04:06:59,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-26 04:06:59,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-26 04:06:59,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 47: [2022-11-26 04:06:59,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-26 04:06:59,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-26 04:06:59,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 49: [2022-11-26 04:06:59,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-26 04:06:59,927] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-26 04:06:59,927] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 17: [2022-11-26 04:06:59,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-26 04:06:59,927] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-26 04:06:59,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 44: [2022-11-26 04:06:59,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-26 04:06:59,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-26 04:06:59,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 46: [2022-11-26 04:06:59,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:06:59,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-26 04:06:59,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 11: [2022-11-26 04:06:59,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 11: [2022-11-26 04:06:59,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 11: [2022-11-26 04:06:59,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 63: [2022-11-26 04:06:59,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-26 04:06:59,929] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-26 04:06:59,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 39: [2022-11-26 04:06:59,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 39: [2022-11-26 04:06:59,930] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-26 04:06:59,930] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 52: [2022-11-26 04:06:59,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-26 04:06:59,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-26 04:06:59,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 54: [2022-11-26 04:06:59,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 53: [2022-11-26 04:06:59,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-26 04:06:59,937] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-26 04:06:59,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 60: [2022-11-26 04:06:59,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-26 04:06:59,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 48: [2022-11-26 04:06:59,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 60: [2022-11-26 04:06:59,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 48: [2022-11-26 04:06:59,938] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-26 04:06:59,938] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 4: [2022-11-26 04:06:59,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 54: [2022-11-26 04:06:59,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-26 04:06:59,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 4: [2022-11-26 04:06:59,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-26 04:06:59,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 59: [2022-11-26 04:06:59,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 59: [2022-11-26 04:06:59,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-26 04:06:59,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 51: [2022-11-26 04:06:59,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:06:59,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-26 04:06:59,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 13: [2022-11-26 04:06:59,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 37: [2022-11-26 04:06:59,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-26 04:06:59,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 13: [2022-11-26 04:06:59,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 37: [2022-11-26 04:06:59,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 13: [2022-11-26 04:06:59,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 29: [2022-11-26 04:06:59,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 21: [2022-11-26 04:06:59,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 29: [2022-11-26 04:06:59,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 21: [2022-11-26 04:06:59,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-26 04:06:59,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 9: [2022-11-26 04:06:59,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 29: [2022-11-26 04:06:59,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 9: [2022-11-26 04:06:59,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-26 04:06:59,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 40: [2022-11-26 04:06:59,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-26 04:06:59,948] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-26 04:06:59,948] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 10: [2022-11-26 04:06:59,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-26 04:06:59,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-26 04:06:59,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 25: [2022-11-26 04:06:59,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-26 04:06:59,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-26 04:06:59,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 56: [2022-11-26 04:06:59,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 22: [2022-11-26 04:06:59,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-26 04:06:59,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 56: [2022-11-26 04:06:59,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 22: [2022-11-26 04:06:59,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 15: [2022-11-26 04:06:59,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 56: [2022-11-26 04:06:59,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 15: [2022-11-26 04:06:59,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-26 04:06:59,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 28: [2022-11-26 04:06:59,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-26 04:06:59,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-26 04:06:59,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 45: [2022-11-26 04:06:59,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-26 04:06:59,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-26 04:06:59,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 7: [2022-11-26 04:06:59,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-26 04:06:59,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-26 04:06:59,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 6: [2022-11-26 04:06:59,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-26 04:06:59,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-26 04:06:59,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 41: [2022-11-26 04:06:59,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 41: [2022-11-26 04:06:59,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 41: [2022-11-26 04:06:59,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 14: [2022-11-26 04:06:59,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 61: [2022-11-26 04:06:59,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 14: [2022-11-26 04:06:59,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-26 04:06:59,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 61: [2022-11-26 04:06:59,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-26 04:06:59,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 0: [2022-11-26 04:06:59,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-26 04:06:59,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-26 04:06:59,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 16: [2022-11-26 04:06:59,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:06:59,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-26 04:06:59,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 5: [2022-11-26 04:06:59,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-26 04:06:59,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-26 04:06:59,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 58: [2022-11-26 04:06:59,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-26 04:06:59,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-26 04:06:59,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 57: [2022-11-26 04:06:59,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 57: [2022-11-26 04:06:59,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 18: [2022-11-26 04:06:59,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 57: [2022-11-26 04:06:59,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 18: [2022-11-26 04:06:59,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-26 04:06:59,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 20: [2022-11-26 04:06:59,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-26 04:06:59,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-26 04:06:59,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 3: [2022-11-26 04:06:59,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 3: [2022-11-26 04:06:59,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-26 04:06:59,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 42: [2022-11-26 04:06:59,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 42: [2022-11-26 04:06:59,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-26 04:06:59,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 55: [2022-11-26 04:06:59,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-26 04:06:59,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-26 04:06:59,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 1: [2022-11-26 04:06:59,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 1: [2022-11-26 04:06:59,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-26 04:06:59,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 34: [2022-11-26 04:06:59,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 34: [2022-11-26 04:06:59,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-26 04:06:59,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 36: [2022-11-26 04:06:59,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 62: [2022-11-26 04:06:59,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-26 04:06:59,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-26 04:06:59,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 30: [2022-11-26 04:06:59,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 30: [2022-11-26 04:06:59,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-26 04:06:59,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 2: [2022-11-26 04:06:59,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-26 04:06:59,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-26 04:06:59,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 24: [2022-11-26 04:06:59,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-26 04:06:59,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-26 04:06:59,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 32: [2022-11-26 04:06:59,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:06:59,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-26 04:06:59,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 23: [2022-11-26 04:06:59,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-26 04:06:59,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-26 04:06:59,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 12: [2022-11-26 04:06:59,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-26 04:06:59,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-26 04:06:59,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 36: [2022-11-26 04:06:59,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-26 04:06:59,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 31: [2022-11-26 04:06:59,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 27: [2022-11-26 04:06:59,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 31: [2022-11-26 04:06:59,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 27: [2022-11-26 04:06:59,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 31: [2022-11-26 04:06:59,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 27: [2022-11-26 04:06:59,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 58: [2022-11-26 04:06:59,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-26 04:06:59,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-26 04:06:59,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 38: [2022-11-26 04:06:59,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-26 04:06:59,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-26 04:06:59,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 26: [2022-11-26 04:06:59,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-26 04:06:59,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-26 04:06:59,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 19: [2022-11-26 04:06:59,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 19: [2022-11-26 04:06:59,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-26 04:06:59,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 8: [2022-11-26 04:06:59,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 8: [2022-11-26 04:06:59,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-26 04:06:59,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 17: [2022-11-26 04:06:59,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-26 04:06:59,971] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-26 04:06:59,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 44: [2022-11-26 04:06:59,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-26 04:06:59,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-26 04:06:59,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 33: [2022-11-26 04:06:59,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 43: [2022-11-26 04:06:59,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 33: [2022-11-26 04:06:59,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 43: [2022-11-26 04:06:59,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 33: [2022-11-26 04:06:59,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 43: [2022-11-26 04:06:59,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 50: [2022-11-26 04:06:59,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 50: [2022-11-26 04:06:59,973] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-26 04:06:59,973] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 47: [2022-11-26 04:06:59,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-26 04:06:59,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-26 04:06:59,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 46: [2022-11-26 04:06:59,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:06:59,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-26 04:06:59,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 48: [2022-11-26 04:06:59,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 48: [2022-11-26 04:06:59,976] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-26 04:06:59,976] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 35: [2022-11-26 04:06:59,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-26 04:06:59,976] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-26 04:06:59,976] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 49: [2022-11-26 04:06:59,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-26 04:06:59,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-26 04:06:59,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 39: [2022-11-26 04:06:59,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-26 04:06:59,978] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-26 04:06:59,978] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 63: [2022-11-26 04:06:59,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 63: [2022-11-26 04:06:59,979] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-26 04:06:59,979] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 52: [2022-11-26 04:06:59,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-26 04:06:59,979] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-26 04:06:59,979] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 40: [2022-11-26 04:06:59,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-26 04:06:59,979] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-26 04:06:59,979] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 4: [2022-11-26 04:06:59,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 59: [2022-11-26 04:06:59,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 53: [2022-11-26 04:06:59,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 59: [2022-11-26 04:06:59,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 53: [2022-11-26 04:06:59,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 59: [2022-11-26 04:06:59,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 4: [2022-11-26 04:06:59,979] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-26 04:06:59,979] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 53: [2022-11-26 04:06:59,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 37: [2022-11-26 04:06:59,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 27: [2022-11-26 04:06:59,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 37: [2022-11-26 04:06:59,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 27: [2022-11-26 04:06:59,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-26 04:06:59,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 37: [2022-11-26 04:06:59,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 9: [2022-11-26 04:06:59,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 11: [2022-11-26 04:06:59,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-26 04:06:59,981] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 11: [2022-11-26 04:06:59,981] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 9: [2022-11-26 04:06:59,981] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-26 04:06:59,981] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 10: [2022-11-26 04:06:59,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-26 04:06:59,981] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-26 04:06:59,981] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 54: [2022-11-26 04:06:59,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 54: [2022-11-26 04:06:59,981] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-26 04:06:59,981] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 60: [2022-11-26 04:06:59,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-26 04:06:59,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-26 04:06:59,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 21: [2022-11-26 04:06:59,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 21: [2022-11-26 04:06:59,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-26 04:06:59,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 20: [2022-11-26 04:06:59,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 20: [2022-11-26 04:06:59,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-26 04:06:59,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 15: [2022-11-26 04:06:59,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-26 04:06:59,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-26 04:06:59,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 13: [2022-11-26 04:06:59,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-26 04:06:59,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-26 04:06:59,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 29: [2022-11-26 04:06:59,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-26 04:06:59,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-26 04:06:59,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 22: [2022-11-26 04:06:59,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-26 04:06:59,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-26 04:06:59,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 51: [2022-11-26 04:06:59,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:06:59,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-26 04:06:59,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 61: [2022-11-26 04:06:59,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 61: [2022-11-26 04:06:59,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-26 04:06:59,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 41: [2022-11-26 04:07:00,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-26 04:07:00,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-26 04:07:00,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 16: [2022-11-26 04:07:00,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:07:00,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-26 04:07:00,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 14: [2022-11-26 04:07:00,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 14: [2022-11-26 04:07:00,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-26 04:07:00,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 28: [2022-11-26 04:07:00,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-26 04:07:00,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-26 04:07:00,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 30: [2022-11-26 04:07:00,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 30: [2022-11-26 04:07:00,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 30: [2022-11-26 04:07:00,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 3: [2022-11-26 04:07:00,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-26 04:07:00,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 3: [2022-11-26 04:07:00,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 0: [2022-11-26 04:07:00,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-26 04:07:00,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-26 04:07:00,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 45: [2022-11-26 04:07:00,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 45: [2022-11-26 04:07:00,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-26 04:07:00,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 6: [2022-11-26 04:07:00,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-26 04:07:00,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-26 04:07:00,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 5: [2022-11-26 04:07:00,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 5: [2022-11-26 04:07:00,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-26 04:07:00,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 56: [2022-11-26 04:07:00,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-26 04:07:00,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 56: [2022-11-26 04:07:00,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 2: [2022-11-26 04:07:00,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 2: [2022-11-26 04:07:00,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-26 04:07:00,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 42: [2022-11-26 04:07:00,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 42: [2022-11-26 04:07:00,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 42: [2022-11-26 04:07:00,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 57: [2022-11-26 04:07:00,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-26 04:07:00,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-26 04:07:00,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 25: [2022-11-26 04:07:00,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-26 04:07:00,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-26 04:07:00,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 26: [2022-11-26 04:07:00,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 26: [2022-11-26 04:07:00,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-26 04:07:00,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 7: [2022-11-26 04:07:00,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-26 04:07:00,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-26 04:07:00,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 55: [2022-11-26 04:07:00,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-26 04:07:00,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-26 04:07:00,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 18: [2022-11-26 04:07:00,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-26 04:07:00,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-26 04:07:00,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 34: [2022-11-26 04:07:00,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 19: [2022-11-26 04:07:00,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 34: [2022-11-26 04:07:00,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-26 04:07:00,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 19: [2022-11-26 04:07:00,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-26 04:07:00,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 62: [2022-11-26 04:07:00,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-26 04:07:00,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-26 04:07:00,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 1: [2022-11-26 04:07:00,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-26 04:07:00,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-26 04:07:00,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 43: [2022-11-26 04:07:00,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-26 04:07:00,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 12: [2022-11-26 04:07:00,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 43: [2022-11-26 04:07:00,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 12: [2022-11-26 04:07:00,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-26 04:07:00,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 36: [2022-11-26 04:07:00,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 36: [2022-11-26 04:07:00,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-26 04:07:00,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 32: [2022-11-26 04:07:00,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:07:00,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-26 04:07:00,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 58: [2022-11-26 04:07:00,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-26 04:07:00,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-26 04:07:00,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 23: [2022-11-26 04:07:00,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-26 04:07:00,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-26 04:07:00,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 50: [2022-11-26 04:07:00,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-26 04:07:00,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-26 04:07:00,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 31: [2022-11-26 04:07:00,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-26 04:07:00,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-26 04:07:00,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 8: [2022-11-26 04:07:00,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-26 04:07:00,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-26 04:07:00,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 48: [2022-11-26 04:07:00,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-26 04:07:00,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-26 04:07:00,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 63: [2022-11-26 04:07:00,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-26 04:07:00,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-26 04:07:00,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 33: [2022-11-26 04:07:00,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-26 04:07:00,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-26 04:07:00,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 52: [2022-11-26 04:07:00,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-26 04:07:00,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-26 04:07:00,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 49: [2022-11-26 04:07:00,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-26 04:07:00,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-26 04:07:00,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 44: [2022-11-26 04:07:00,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-26 04:07:00,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-26 04:07:00,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 47: [2022-11-26 04:07:00,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 47: [2022-11-26 04:07:00,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 47: [2022-11-26 04:07:00,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 38: [2022-11-26 04:07:00,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-26 04:07:00,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-26 04:07:00,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 46: [2022-11-26 04:07:00,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:07:00,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-26 04:07:00,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 11: [2022-11-26 04:07:00,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 11: [2022-11-26 04:07:00,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 11: [2022-11-26 04:07:00,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 35: [2022-11-26 04:07:00,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-26 04:07:00,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-26 04:07:00,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 9: [2022-11-26 04:07:00,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 9: [2022-11-26 04:07:00,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-26 04:07:00,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 60: [2022-11-26 04:07:00,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-26 04:07:00,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-26 04:07:00,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 39: [2022-11-26 04:07:00,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 37: [2022-11-26 04:07:00,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 39: [2022-11-26 04:07:00,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-26 04:07:00,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 37: [2022-11-26 04:07:00,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-26 04:07:00,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 53: [2022-11-26 04:07:00,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 17: [2022-11-26 04:07:00,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 53: [2022-11-26 04:07:00,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-26 04:07:00,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 17: [2022-11-26 04:07:00,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-26 04:07:00,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 4: [2022-11-26 04:07:00,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 4: [2022-11-26 04:07:00,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 22: [2022-11-26 04:07:00,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 4: [2022-11-26 04:07:00,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 22: [2022-11-26 04:07:00,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 54: [2022-11-26 04:07:00,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 22: [2022-11-26 04:07:00,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 27: [2022-11-26 04:07:00,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 27: [2022-11-26 04:07:00,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-26 04:07:00,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 10: [2022-11-26 04:07:00,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-26 04:07:00,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-26 04:07:00,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 59: [2022-11-26 04:07:00,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-26 04:07:00,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-26 04:07:00,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 28: [2022-11-26 04:07:00,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 28: [2022-11-26 04:07:00,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-26 04:07:00,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 54: [2022-11-26 04:07:00,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-26 04:07:00,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 13: [2022-11-26 04:07:00,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-26 04:07:00,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-26 04:07:00,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 61: [2022-11-26 04:07:00,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-26 04:07:00,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-26 04:07:00,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 3: [2022-11-26 04:07:00,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 20: [2022-11-26 04:07:00,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 3: [2022-11-26 04:07:00,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 20: [2022-11-26 04:07:00,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 3: [2022-11-26 04:07:00,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 20: [2022-11-26 04:07:00,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 6: [2022-11-26 04:07:00,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 6: [2022-11-26 04:07:00,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-26 04:07:00,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 0: [2022-11-26 04:07:00,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-26 04:07:00,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 0: [2022-11-26 04:07:00,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 24: [2022-11-26 04:07:00,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-26 04:07:00,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-26 04:07:00,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 21: [2022-11-26 04:07:00,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 21: [2022-11-26 04:07:00,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-26 04:07:00,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 30: [2022-11-26 04:07:00,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-26 04:07:00,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-26 04:07:00,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 5: [2022-11-26 04:07:00,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-26 04:07:00,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-26 04:07:00,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 32: [2022-11-26 04:07:00,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 34: [2022-11-26 04:07:00,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:07:00,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-26 04:07:00,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 34: [2022-11-26 04:07:00,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-26 04:07:00,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 41: [2022-11-26 04:07:00,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-26 04:07:00,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-26 04:07:00,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 33: [2022-11-26 04:07:00,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 42: [2022-11-26 04:07:00,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 33: [2022-11-26 04:07:00,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 42: [2022-11-26 04:07:00,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-26 04:07:00,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 33: [2022-11-26 04:07:00,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 23: [2022-11-26 04:07:00,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 14: [2022-11-26 04:07:00,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 23: [2022-11-26 04:07:00,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 40: [2022-11-26 04:07:00,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 14: [2022-11-26 04:07:00,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 23: [2022-11-26 04:07:00,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 14: [2022-11-26 04:07:00,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 40: [2022-11-26 04:07:00,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 40: [2022-11-26 04:07:00,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 43: [2022-11-26 04:07:00,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-26 04:07:00,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-26 04:07:00,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 7: [2022-11-26 04:07:00,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 56: [2022-11-26 04:07:00,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 7: [2022-11-26 04:07:00,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 56: [2022-11-26 04:07:00,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-26 04:07:00,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 7: [2022-11-26 04:07:00,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 16: [2022-11-26 04:07:00,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 2: [2022-11-26 04:07:00,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:07:00,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 2: [2022-11-26 04:07:00,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 16: [2022-11-26 04:07:00,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 2: [2022-11-26 04:07:00,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 45: [2022-11-26 04:07:00,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:07:00,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:07:00,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 45: [2022-11-26 04:07:00,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 51: [2022-11-26 04:07:00,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 45: [2022-11-26 04:07:00,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 25: [2022-11-26 04:07:00,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 63: [2022-11-26 04:07:00,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 25: [2022-11-26 04:07:00,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 24: [2022-11-26 04:07:00,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 63: [2022-11-26 04:07:00,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 25: [2022-11-26 04:07:00,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 63: [2022-11-26 04:07:00,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 24: [2022-11-26 04:07:00,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 50: [2022-11-26 04:07:00,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 24: [2022-11-26 04:07:00,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 15: [2022-11-26 04:07:00,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 50: [2022-11-26 04:07:00,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 15: [2022-11-26 04:07:00,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-26 04:07:00,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 50: [2022-11-26 04:07:00,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 62: [2022-11-26 04:07:00,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-26 04:07:00,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 57: [2022-11-26 04:07:00,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 62: [2022-11-26 04:07:00,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 44: [2022-11-26 04:07:00,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 57: [2022-11-26 04:07:00,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 44: [2022-11-26 04:07:00,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 57: [2022-11-26 04:07:00,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 44: [2022-11-26 04:07:00,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 26: [2022-11-26 04:07:00,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-26 04:07:00,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-26 04:07:00,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 18: [2022-11-26 04:07:00,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 58: [2022-11-26 04:07:00,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 18: [2022-11-26 04:07:00,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-26 04:07:00,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 58: [2022-11-26 04:07:00,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-26 04:07:00,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 60: [2022-11-26 04:07:00,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-26 04:07:00,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 31: [2022-11-26 04:07:00,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 60: [2022-11-26 04:07:00,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 31: [2022-11-26 04:07:00,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-26 04:07:00,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 55: [2022-11-26 04:07:00,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-26 04:07:00,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-26 04:07:00,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 8: [2022-11-26 04:07:00,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 8: [2022-11-26 04:07:00,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-26 04:07:00,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 12: [2022-11-26 04:07:00,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-26 04:07:00,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-26 04:07:00,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 29: [2022-11-26 04:07:00,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-26 04:07:00,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-26 04:07:00,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 53: [2022-11-26 04:07:00,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:07:00,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 53: [2022-11-26 04:07:00,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 51: [2022-11-26 04:07:00,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 53: [2022-11-26 04:07:00,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 51: [2022-11-26 04:07:00,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 38: [2022-11-26 04:07:00,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 38: [2022-11-26 04:07:00,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-26 04:07:00,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 39: [2022-11-26 04:07:00,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-26 04:07:00,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-26 04:07:00,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 57: [2022-11-26 04:07:00,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 57: [2022-11-26 04:07:00,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-26 04:07:00,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 13: [2022-11-26 04:07:00,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-26 04:07:00,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-26 04:07:00,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 16: [2022-11-26 04:07:00,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 15: [2022-11-26 04:07:00,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:07:00,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-26 04:07:00,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 15: [2022-11-26 04:07:00,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 52: [2022-11-26 04:07:00,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 15: [2022-11-26 04:07:00,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 52: [2022-11-26 04:07:00,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-26 04:07:00,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 40: [2022-11-26 04:07:00,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-26 04:07:00,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-26 04:07:00,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 47: [2022-11-26 04:07:00,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-26 04:07:00,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-26 04:07:00,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 48: [2022-11-26 04:07:00,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 37: [2022-11-26 04:07:00,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 48: [2022-11-26 04:07:00,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-26 04:07:00,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 37: [2022-11-26 04:07:00,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-26 04:07:00,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 46: [2022-11-26 04:07:00,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:07:00,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-26 04:07:00,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 41: [2022-11-26 04:07:00,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-26 04:07:00,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-26 04:07:00,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 18: [2022-11-26 04:07:00,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-26 04:07:00,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-26 04:07:00,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 29: [2022-11-26 04:07:00,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-26 04:07:00,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-26 04:07:00,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 14: [2022-11-26 04:07:00,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 1: [2022-11-26 04:07:00,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 14: [2022-11-26 04:07:00,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 1: [2022-11-26 04:07:00,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-26 04:07:00,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 14: [2022-11-26 04:07:00,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 35: [2022-11-26 04:07:00,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-26 04:07:00,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-26 04:07:00,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 20: [2022-11-26 04:07:00,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-26 04:07:00,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-26 04:07:00,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 56: [2022-11-26 04:07:00,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-26 04:07:00,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-26 04:07:00,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 54: [2022-11-26 04:07:00,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 54: [2022-11-26 04:07:00,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-26 04:07:00,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 45: [2022-11-26 04:07:00,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-26 04:07:00,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-26 04:07:00,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 17: [2022-11-26 04:07:00,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-26 04:07:00,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-26 04:07:00,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-26 04:07:00,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-26 04:07:00,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 17: [2022-11-26 04:07:00,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 1: [2022-11-26 04:07:00,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 49: [2022-11-26 04:07:00,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 1: [2022-11-26 04:07:00,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 49: [2022-11-26 04:07:00,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 1: [2022-11-26 04:07:00,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 49: [2022-11-26 04:07:00,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 24: [2022-11-26 04:07:00,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-26 04:07:00,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-26 04:07:00,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 59: [2022-11-26 04:07:00,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-26 04:07:00,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-26 04:07:00,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 49: [2022-11-26 04:07:00,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-26 04:07:00,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 49: [2022-11-26 04:07:00,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 19: [2022-11-26 04:07:00,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 19: [2022-11-26 04:07:00,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-26 04:07:00,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 36: [2022-11-26 04:07:00,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-26 04:07:00,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-26 04:07:00,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 11: [2022-11-26 04:07:00,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-26 04:07:00,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step16000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-26 04:07:00,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! 0: successfully saved checkpoint at iteration 16000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5181.06 63: iteration 16010/ 24424 | consumed samples: 8197120 | consumed tokens: 16787701760 | elapsed time per iteration (s): 2.82 | learning rate: 6.864E-05 | global batch size: 512 | lm loss: 2.024463E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.341 | TFLOPs: 18.67 | 63: iteration 16020/ 24424 | consumed samples: 8202240 | consumed tokens: 16798187520 | elapsed time per iteration (s): 2.26 | learning rate: 6.853E-05 | global batch size: 512 | lm loss: 2.051615E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.550 | TFLOPs: 23.32 | 63: iteration 16030/ 24424 | consumed samples: 8207360 | consumed tokens: 16808673280 | elapsed time per iteration (s): 2.23 | learning rate: 6.843E-05 | global batch size: 512 | lm loss: 2.045178E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.266 | TFLOPs: 23.60 | 63: iteration 16040/ 24424 | consumed samples: 8212480 | consumed tokens: 16819159040 | elapsed time per iteration (s): 2.25 | learning rate: 6.833E-05 | global batch size: 512 | lm loss: 2.047999E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.059 | TFLOPs: 23.48 | 63: iteration 16050/ 24424 | consumed samples: 8217600 | consumed tokens: 16829644800 | elapsed time per iteration (s): 2.25 | learning rate: 6.822E-05 | global batch size: 512 | lm loss: 2.048445E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.043 | TFLOPs: 23.48 | 63: iteration 16060/ 24424 | consumed samples: 8222720 | consumed tokens: 16840130560 | elapsed time per iteration (s): 2.67 | learning rate: 6.812E-05 | global batch size: 512 | lm loss: 2.032447E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 191.765 | TFLOPs: 19.74 | 63: iteration 16070/ 24424 | consumed samples: 8227840 | consumed tokens: 16850616320 | elapsed time per iteration (s): 2.23 | learning rate: 6.802E-05 | global batch size: 512 | lm loss: 2.039657E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.960 | TFLOPs: 23.67 | 63: iteration 16080/ 24424 | consumed samples: 8232960 | consumed tokens: 16861102080 | elapsed time per iteration (s): 2.24 | learning rate: 6.791E-05 | global batch size: 512 | lm loss: 2.039209E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.965 | TFLOPs: 23.57 | 63: iteration 16090/ 24424 | consumed samples: 8238080 | consumed tokens: 16871587840 | elapsed time per iteration (s): 2.25 | learning rate: 6.781E-05 | global batch size: 512 | lm loss: 2.024064E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.265 | TFLOPs: 23.40 | 63: iteration 16100/ 24424 | consumed samples: 8243200 | consumed tokens: 16882073600 | elapsed time per iteration (s): 2.23 | learning rate: 6.771E-05 | global batch size: 512 | lm loss: 2.039941E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.486 | TFLOPs: 23.62 | 63: iteration 16110/ 24424 | consumed samples: 8248320 | consumed tokens: 16892559360 | elapsed time per iteration (s): 2.23 | learning rate: 6.760E-05 | global batch size: 512 | lm loss: 2.031740E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.824 | TFLOPs: 23.66 | 63: iteration 16120/ 24424 | consumed samples: 8253440 | consumed tokens: 16903045120 | elapsed time per iteration (s): 2.23 | learning rate: 6.750E-05 | global batch size: 512 | lm loss: 2.036521E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.158 | TFLOPs: 23.59 | 63: iteration 16130/ 24424 | consumed samples: 8258560 | consumed tokens: 16913530880 | elapsed time per iteration (s): 2.23 | learning rate: 6.740E-05 | global batch size: 512 | lm loss: 2.073830E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.854 | TFLOPs: 23.66 | 63: iteration 16140/ 24424 | consumed samples: 8263680 | consumed tokens: 16924016640 | elapsed time per iteration (s): 2.27 | learning rate: 6.729E-05 | global batch size: 512 | lm loss: 2.031300E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.070 | TFLOPs: 23.17 | 63: iteration 16150/ 24424 | consumed samples: 8268800 | consumed tokens: 16934502400 | elapsed time per iteration (s): 2.25 | learning rate: 6.719E-05 | global batch size: 512 | lm loss: 2.043290E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.347 | TFLOPs: 23.40 | 63: iteration 16160/ 24424 | consumed samples: 8273920 | consumed tokens: 16944988160 | elapsed time per iteration (s): 2.25 | learning rate: 6.709E-05 | global batch size: 512 | lm loss: 2.047870E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.916 | TFLOPs: 23.46 | 63: iteration 16170/ 24424 | consumed samples: 8279040 | consumed tokens: 16955473920 | elapsed time per iteration (s): 2.25 | learning rate: 6.698E-05 | global batch size: 512 | lm loss: 2.057668E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.115 | TFLOPs: 23.38 | 63: iteration 16180/ 24424 | consumed samples: 8284160 | consumed tokens: 16965959680 | elapsed time per iteration (s): 2.24 | learning rate: 6.688E-05 | global batch size: 512 | lm loss: 2.038531E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.854 | TFLOPs: 23.56 | 63: iteration 16190/ 24424 | consumed samples: 8289280 | consumed tokens: 16976445440 | elapsed time per iteration (s): 2.69 | learning rate: 6.678E-05 | global batch size: 512 | lm loss: 2.031534E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.617 | TFLOPs: 19.62 | 63: iteration 16200/ 24424 | consumed samples: 8294400 | consumed tokens: 16986931200 | elapsed time per iteration (s): 2.23 | learning rate: 6.668E-05 | global batch size: 512 | lm loss: 2.024460E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.138 | TFLOPs: 23.59 | 63: iteration 16210/ 24424 | consumed samples: 8299520 | consumed tokens: 16997416960 | elapsed time per iteration (s): 2.25 | learning rate: 6.657E-05 | global batch size: 512 | lm loss: 2.032728E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.411 | TFLOPs: 23.41 | 63: iteration 16220/ 24424 | consumed samples: 8304640 | consumed tokens: 17007902720 | elapsed time per iteration (s): 2.25 | learning rate: 6.647E-05 | global batch size: 512 | lm loss: 2.713974E+00 | grad norm: 8.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.229 | TFLOPs: 23.39 | 63: iteration 16230/ 24424 | consumed samples: 8309760 | consumed tokens: 17018388480 | elapsed time per iteration (s): 2.27 | learning rate: 6.637E-05 | global batch size: 512 | lm loss: 2.499924E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.745 | TFLOPs: 23.24 | 63: iteration 16240/ 24424 | consumed samples: 8314880 | consumed tokens: 17028874240 | elapsed time per iteration (s): 2.23 | learning rate: 6.627E-05 | global batch size: 512 | lm loss: 2.093338E+00 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.380 | TFLOPs: 23.61 | 63: iteration 16250/ 24424 | consumed samples: 8320000 | consumed tokens: 17039360000 | elapsed time per iteration (s): 2.23 | learning rate: 6.617E-05 | global batch size: 512 | lm loss: 2.029371E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.858 | TFLOPs: 23.66 | 63: iteration 16260/ 24424 | consumed samples: 8325120 | consumed tokens: 17049845760 | elapsed time per iteration (s): 2.23 | learning rate: 6.606E-05 | global batch size: 512 | lm loss: 2.042098E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.532 | TFLOPs: 23.63 | 63: iteration 16270/ 24424 | consumed samples: 8330240 | consumed tokens: 17060331520 | elapsed time per iteration (s): 2.24 | learning rate: 6.596E-05 | global batch size: 512 | lm loss: 2.032339E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.825 | TFLOPs: 23.56 | 63: iteration 16280/ 24424 | consumed samples: 8335360 | consumed tokens: 17070817280 | elapsed time per iteration (s): 2.24 | learning rate: 6.586E-05 | global batch size: 512 | lm loss: 2.053734E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.842 | TFLOPs: 23.56 | 63: iteration 16290/ 24424 | consumed samples: 8340480 | consumed tokens: 17081303040 | elapsed time per iteration (s): 2.23 | learning rate: 6.576E-05 | global batch size: 512 | lm loss: 2.029197E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.814 | TFLOPs: 23.66 | 63: iteration 16300/ 24424 | consumed samples: 8345600 | consumed tokens: 17091788800 | elapsed time per iteration (s): 2.23 | learning rate: 6.566E-05 | global batch size: 512 | lm loss: 2.039752E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.855 | TFLOPs: 23.66 | 63: iteration 16310/ 24424 | consumed samples: 8350720 | consumed tokens: 17102274560 | elapsed time per iteration (s): 2.26 | learning rate: 6.555E-05 | global batch size: 512 | lm loss: 2.020616E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.863 | TFLOPs: 23.35 | 63: iteration 16320/ 24424 | consumed samples: 8355840 | consumed tokens: 17112760320 | elapsed time per iteration (s): 2.24 | learning rate: 6.545E-05 | global batch size: 512 | lm loss: 2.023034E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.297 | TFLOPs: 23.50 | 63: iteration 16330/ 24424 | consumed samples: 8360960 | consumed tokens: 17123246080 | elapsed time per iteration (s): 2.23 | learning rate: 6.535E-05 | global batch size: 512 | lm loss: 2.004776E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.354 | TFLOPs: 23.61 | 63: iteration 16340/ 24424 | consumed samples: 8366080 | consumed tokens: 17133731840 | elapsed time per iteration (s): 2.25 | learning rate: 6.525E-05 | global batch size: 512 | lm loss: 2.044284E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.795 | TFLOPs: 23.45 | 63: iteration 16350/ 24424 | consumed samples: 8371200 | consumed tokens: 17144217600 | elapsed time per iteration (s): 2.23 | learning rate: 6.515E-05 | global batch size: 512 | lm loss: 2.027567E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.679 | TFLOPs: 23.64 | 63: iteration 16360/ 24424 | consumed samples: 8376320 | consumed tokens: 17154703360 | elapsed time per iteration (s): 2.32 | learning rate: 6.505E-05 | global batch size: 512 | lm loss: 2.024323E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.706 | TFLOPs: 22.72 | 63: iteration 16370/ 24424 | consumed samples: 8381440 | consumed tokens: 17165189120 | elapsed time per iteration (s): 2.23 | learning rate: 6.495E-05 | global batch size: 512 | lm loss: 2.029734E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.409 | TFLOPs: 23.62 | 63: iteration 16380/ 24424 | consumed samples: 8386560 | consumed tokens: 17175674880 | elapsed time per iteration (s): 2.70 | learning rate: 6.484E-05 | global batch size: 512 | lm loss: 2.038991E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.736 | TFLOPs: 19.53 | 63: iteration 16390/ 24424 | consumed samples: 8391680 | consumed tokens: 17186160640 | elapsed time per iteration (s): 2.24 | learning rate: 6.474E-05 | global batch size: 512 | lm loss: 2.030565E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.107 | TFLOPs: 23.48 | 63: iteration 16400/ 24424 | consumed samples: 8396800 | consumed tokens: 17196646400 | elapsed time per iteration (s): 2.26 | learning rate: 6.464E-05 | global batch size: 512 | lm loss: 2.033262E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.376 | TFLOPs: 23.30 | 63: iteration 16410/ 24424 | consumed samples: 8401920 | consumed tokens: 17207132160 | elapsed time per iteration (s): 2.25 | learning rate: 6.454E-05 | global batch size: 512 | lm loss: 2.048841E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.180 | TFLOPs: 23.39 | 63: iteration 16420/ 24424 | consumed samples: 8407040 | consumed tokens: 17217617920 | elapsed time per iteration (s): 2.25 | learning rate: 6.444E-05 | global batch size: 512 | lm loss: 2.057052E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.152 | TFLOPs: 23.38 | 63: iteration 16430/ 24424 | consumed samples: 8412160 | consumed tokens: 17228103680 | elapsed time per iteration (s): 2.24 | learning rate: 6.434E-05 | global batch size: 512 | lm loss: 2.028575E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.379 | TFLOPs: 23.51 | 63: iteration 16440/ 24424 | consumed samples: 8417280 | consumed tokens: 17238589440 | elapsed time per iteration (s): 2.23 | learning rate: 6.424E-05 | global batch size: 512 | lm loss: 2.026057E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.562 | TFLOPs: 23.63 | 63: iteration 16450/ 24424 | consumed samples: 8422400 | consumed tokens: 17249075200 | elapsed time per iteration (s): 2.28 | learning rate: 6.414E-05 | global batch size: 512 | lm loss: 2.020772E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.790 | TFLOPs: 23.14 | 63: iteration 16460/ 24424 | consumed samples: 8427520 | consumed tokens: 17259560960 | elapsed time per iteration (s): 2.23 | learning rate: 6.404E-05 | global batch size: 512 | lm loss: 2.021275E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.723 | TFLOPs: 23.65 | 63: iteration 16470/ 24424 | consumed samples: 8432640 | consumed tokens: 17270046720 | elapsed time per iteration (s): 2.25 | learning rate: 6.394E-05 | global batch size: 512 | lm loss: 2.060018E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.819 | TFLOPs: 23.45 | 63: iteration 16480/ 24424 | consumed samples: 8437760 | consumed tokens: 17280532480 | elapsed time per iteration (s): 2.25 | learning rate: 6.384E-05 | global batch size: 512 | lm loss: 2.041519E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.677 | TFLOPs: 23.44 | 63: iteration 16490/ 24424 | consumed samples: 8442880 | consumed tokens: 17291018240 | elapsed time per iteration (s): 2.25 | learning rate: 6.374E-05 | global batch size: 512 | lm loss: 2.048069E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.757 | TFLOPs: 23.45 | 63: iteration 16500/ 24424 | consumed samples: 8448000 | consumed tokens: 17301504000 | elapsed time per iteration (s): 2.23 | learning rate: 6.364E-05 | global batch size: 512 | lm loss: 2.035447E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.945 | TFLOPs: 23.67 | 63: iteration 16510/ 24424 | consumed samples: 8453120 | consumed tokens: 17311989760 | elapsed time per iteration (s): 2.82 | learning rate: 6.354E-05 | global batch size: 512 | lm loss: 2.047283E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.491 | TFLOPs: 18.68 | 63: iteration 16520/ 24424 | consumed samples: 8458240 | consumed tokens: 17322475520 | elapsed time per iteration (s): 2.25 | learning rate: 6.344E-05 | global batch size: 512 | lm loss: 2.023179E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.906 | TFLOPs: 23.46 | 63: iteration 16530/ 24424 | consumed samples: 8463360 | consumed tokens: 17332961280 | elapsed time per iteration (s): 2.28 | learning rate: 6.334E-05 | global batch size: 512 | lm loss: 2.011707E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.787 | TFLOPs: 23.14 | 63: iteration 16540/ 24424 | consumed samples: 8468480 | consumed tokens: 17343447040 | elapsed time per iteration (s): 2.23 | learning rate: 6.324E-05 | global batch size: 512 | lm loss: 2.010793E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.101 | TFLOPs: 23.58 | 63: iteration 16550/ 24424 | consumed samples: 8473600 | consumed tokens: 17353932800 | elapsed time per iteration (s): 2.25 | learning rate: 6.314E-05 | global batch size: 512 | lm loss: 2.031149E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.967 | TFLOPs: 23.47 | 63: iteration 16560/ 24424 | consumed samples: 8478720 | consumed tokens: 17364418560 | elapsed time per iteration (s): 2.23 | learning rate: 6.304E-05 | global batch size: 512 | lm loss: 2.012449E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.995 | TFLOPs: 23.68 | 63: iteration 16570/ 24424 | consumed samples: 8483840 | consumed tokens: 17374904320 | elapsed time per iteration (s): 2.24 | learning rate: 6.294E-05 | global batch size: 512 | lm loss: 2.008348E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.642 | TFLOPs: 23.54 | 63: iteration 16580/ 24424 | consumed samples: 8488960 | consumed tokens: 17385390080 | elapsed time per iteration (s): 2.23 | learning rate: 6.284E-05 | global batch size: 512 | lm loss: 2.026709E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.436 | TFLOPs: 23.62 | 63: iteration 16590/ 24424 | consumed samples: 8494080 | consumed tokens: 17395875840 | elapsed time per iteration (s): 2.28 | learning rate: 6.274E-05 | global batch size: 512 | lm loss: 2.028781E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.952 | TFLOPs: 23.16 | 63: iteration 16600/ 24424 | consumed samples: 8499200 | consumed tokens: 17406361600 | elapsed time per iteration (s): 2.24 | learning rate: 6.264E-05 | global batch size: 512 | lm loss: 2.024026E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.243 | TFLOPs: 23.50 | 63: iteration 16610/ 24424 | consumed samples: 8504320 | consumed tokens: 17416847360 | elapsed time per iteration (s): 2.23 | learning rate: 6.254E-05 | global batch size: 512 | lm loss: 2.031678E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.145 | TFLOPs: 23.59 | 63: iteration 16620/ 24424 | consumed samples: 8509440 | consumed tokens: 17427333120 | elapsed time per iteration (s): 2.23 | learning rate: 6.244E-05 | global batch size: 512 | lm loss: 2.036243E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.253 | TFLOPs: 23.60 | 63: iteration 16630/ 24424 | consumed samples: 8514560 | consumed tokens: 17437818880 | elapsed time per iteration (s): 2.24 | learning rate: 6.234E-05 | global batch size: 512 | lm loss: 2.013413E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.497 | TFLOPs: 23.52 | 63: iteration 16640/ 24424 | consumed samples: 8519680 | consumed tokens: 17448304640 | elapsed time per iteration (s): 2.25 | learning rate: 6.224E-05 | global batch size: 512 | lm loss: 2.063626E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.320 | TFLOPs: 23.40 | 63: iteration 16650/ 24424 | consumed samples: 8524800 | consumed tokens: 17458790400 | elapsed time per iteration (s): 2.24 | learning rate: 6.214E-05 | global batch size: 512 | lm loss: 2.025647E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.760 | TFLOPs: 23.55 | 63: iteration 16660/ 24424 | consumed samples: 8529920 | consumed tokens: 17469276160 | elapsed time per iteration (s): 2.24 | learning rate: 6.204E-05 | global batch size: 512 | lm loss: 2.061679E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.636 | TFLOPs: 23.54 | 63: iteration 16670/ 24424 | consumed samples: 8535040 | consumed tokens: 17479761920 | elapsed time per iteration (s): 2.43 | learning rate: 6.194E-05 | global batch size: 512 | lm loss: 2.034463E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 211.017 | TFLOPs: 21.72 | 63: iteration 16680/ 24424 | consumed samples: 8540160 | consumed tokens: 17490247680 | elapsed time per iteration (s): 2.25 | learning rate: 6.184E-05 | global batch size: 512 | lm loss: 2.015485E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.693 | TFLOPs: 23.44 | 63: iteration 16690/ 24424 | consumed samples: 8545280 | consumed tokens: 17500733440 | elapsed time per iteration (s): 2.23 | learning rate: 6.175E-05 | global batch size: 512 | lm loss: 2.041762E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.396 | TFLOPs: 23.62 | 63: iteration 16700/ 24424 | consumed samples: 8550400 | consumed tokens: 17511219200 | elapsed time per iteration (s): 2.23 | learning rate: 6.165E-05 | global batch size: 512 | lm loss: 2.035989E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.927 | TFLOPs: 23.67 | 63: iteration 16710/ 24424 | consumed samples: 8555520 | consumed tokens: 17521704960 | elapsed time per iteration (s): 2.28 | learning rate: 6.155E-05 | global batch size: 512 | lm loss: 2.019825E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.038 | TFLOPs: 23.17 | 63: iteration 16720/ 24424 | consumed samples: 8560640 | consumed tokens: 17532190720 | elapsed time per iteration (s): 2.24 | learning rate: 6.145E-05 | global batch size: 512 | lm loss: 2.015733E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.174 | TFLOPs: 23.49 | 63: iteration 16730/ 24424 | consumed samples: 8565760 | consumed tokens: 17542676480 | elapsed time per iteration (s): 2.25 | learning rate: 6.135E-05 | global batch size: 512 | lm loss: 2.053358E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.020 | TFLOPs: 23.47 | 63: iteration 16740/ 24424 | consumed samples: 8570880 | consumed tokens: 17553162240 | elapsed time per iteration (s): 2.24 | learning rate: 6.125E-05 | global batch size: 512 | lm loss: 2.022976E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.960 | TFLOPs: 23.57 | 63: iteration 16750/ 24424 | consumed samples: 8576000 | consumed tokens: 17563648000 | elapsed time per iteration (s): 2.26 | learning rate: 6.116E-05 | global batch size: 512 | lm loss: 2.031424E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.592 | TFLOPs: 23.33 | 63: iteration 16760/ 24424 | consumed samples: 8581120 | consumed tokens: 17574133760 | elapsed time per iteration (s): 2.26 | learning rate: 6.106E-05 | global batch size: 512 | lm loss: 2.022267E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.257 | TFLOPs: 23.29 | 63: iteration 16770/ 24424 | consumed samples: 8586240 | consumed tokens: 17584619520 | elapsed time per iteration (s): 2.24 | learning rate: 6.096E-05 | global batch size: 512 | lm loss: 2.024177E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.034 | TFLOPs: 23.58 | 63: iteration 16780/ 24424 | consumed samples: 8591360 | consumed tokens: 17595105280 | elapsed time per iteration (s): 2.24 | learning rate: 6.086E-05 | global batch size: 512 | lm loss: 2.032541E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.162 | TFLOPs: 23.49 | 63: iteration 16790/ 24424 | consumed samples: 8596480 | consumed tokens: 17605591040 | elapsed time per iteration (s): 2.24 | learning rate: 6.076E-05 | global batch size: 512 | lm loss: 2.032705E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.657 | TFLOPs: 23.54 | 63: iteration 16800/ 24424 | consumed samples: 8601600 | consumed tokens: 17616076800 | elapsed time per iteration (s): 2.29 | learning rate: 6.067E-05 | global batch size: 512 | lm loss: 2.026592E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.523 | TFLOPs: 23.01 | 63: iteration 16810/ 24424 | consumed samples: 8606720 | consumed tokens: 17626562560 | elapsed time per iteration (s): 2.26 | learning rate: 6.057E-05 | global batch size: 512 | lm loss: 2.025759E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.945 | TFLOPs: 23.36 | 63: iteration 16820/ 24424 | consumed samples: 8611840 | consumed tokens: 17637048320 | elapsed time per iteration (s): 2.26 | learning rate: 6.047E-05 | global batch size: 512 | lm loss: 2.027674E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.870 | TFLOPs: 23.36 | 63: iteration 16830/ 24424 | consumed samples: 8616960 | consumed tokens: 17647534080 | elapsed time per iteration (s): 2.73 | learning rate: 6.037E-05 | global batch size: 512 | lm loss: 2.032380E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 187.704 | TFLOPs: 19.32 | 63: iteration 16840/ 24424 | consumed samples: 8622080 | consumed tokens: 17658019840 | elapsed time per iteration (s): 2.28 | learning rate: 6.027E-05 | global batch size: 512 | lm loss: 2.016614E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.335 | TFLOPs: 23.09 | 63: iteration 16850/ 24424 | consumed samples: 8627200 | consumed tokens: 17668505600 | elapsed time per iteration (s): 2.23 | learning rate: 6.018E-05 | global batch size: 512 | lm loss: 2.030783E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.933 | TFLOPs: 23.67 | 63: iteration 16860/ 24424 | consumed samples: 8632320 | consumed tokens: 17678991360 | elapsed time per iteration (s): 2.30 | learning rate: 6.008E-05 | global batch size: 512 | lm loss: 2.040903E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.043 | TFLOPs: 22.96 | 63: iteration 16870/ 24424 | consumed samples: 8637440 | consumed tokens: 17689477120 | elapsed time per iteration (s): 2.28 | learning rate: 5.998E-05 | global batch size: 512 | lm loss: 2.041068E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.731 | TFLOPs: 23.14 | 63: iteration 16880/ 24424 | consumed samples: 8642560 | consumed tokens: 17699962880 | elapsed time per iteration (s): 2.27 | learning rate: 5.989E-05 | global batch size: 512 | lm loss: 2.051957E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.308 | TFLOPs: 23.19 | 63: iteration 16890/ 24424 | consumed samples: 8647680 | consumed tokens: 17710448640 | elapsed time per iteration (s): 2.31 | learning rate: 5.979E-05 | global batch size: 512 | lm loss: 2.028241E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.860 | TFLOPs: 22.84 | 63: iteration 16900/ 24424 | consumed samples: 8652800 | consumed tokens: 17720934400 | elapsed time per iteration (s): 2.23 | learning rate: 5.969E-05 | global batch size: 512 | lm loss: 2.036758E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.596 | TFLOPs: 23.64 | 63: iteration 16910/ 24424 | consumed samples: 8657920 | consumed tokens: 17731420160 | elapsed time per iteration (s): 2.32 | learning rate: 5.959E-05 | global batch size: 512 | lm loss: 2.038942E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.304 | TFLOPs: 22.68 | 63: iteration 16920/ 24424 | consumed samples: 8663040 | consumed tokens: 17741905920 | elapsed time per iteration (s): 2.24 | learning rate: 5.950E-05 | global batch size: 512 | lm loss: 2.050536E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.082 | TFLOPs: 23.58 | 63: iteration 16930/ 24424 | consumed samples: 8668160 | consumed tokens: 17752391680 | elapsed time per iteration (s): 2.23 | learning rate: 5.940E-05 | global batch size: 512 | lm loss: 2.032125E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.422 | TFLOPs: 23.62 | 63: iteration 16940/ 24424 | consumed samples: 8673280 | consumed tokens: 17762877440 | elapsed time per iteration (s): 2.24 | learning rate: 5.930E-05 | global batch size: 512 | lm loss: 2.015241E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.745 | TFLOPs: 23.55 | 63: iteration 16950/ 24424 | consumed samples: 8678400 | consumed tokens: 17773363200 | elapsed time per iteration (s): 2.24 | learning rate: 5.921E-05 | global batch size: 512 | lm loss: 2.018822E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.341 | TFLOPs: 23.51 | 63: iteration 16960/ 24424 | consumed samples: 8683520 | consumed tokens: 17783848960 | elapsed time per iteration (s): 2.25 | learning rate: 5.911E-05 | global batch size: 512 | lm loss: 2.033453E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.177 | TFLOPs: 23.39 | 63: iteration 16970/ 24424 | consumed samples: 8688640 | consumed tokens: 17794334720 | elapsed time per iteration (s): 2.26 | learning rate: 5.901E-05 | global batch size: 512 | lm loss: 2.012827E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.344 | TFLOPs: 23.30 | 63: iteration 16980/ 24424 | consumed samples: 8693760 | consumed tokens: 17804820480 | elapsed time per iteration (s): 2.27 | learning rate: 5.892E-05 | global batch size: 512 | lm loss: 2.026025E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.594 | TFLOPs: 23.22 | 63: iteration 16990/ 24424 | consumed samples: 8698880 | consumed tokens: 17815306240 | elapsed time per iteration (s): 2.59 | learning rate: 5.882E-05 | global batch size: 512 | lm loss: 2.012202E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 197.550 | TFLOPs: 20.34 | 63: iteration 17000/ 24424 | consumed samples: 8704000 | consumed tokens: 17825792000 | elapsed time per iteration (s): 2.37 | learning rate: 5.873E-05 | global batch size: 512 | lm loss: 2.036496E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 216.281 | TFLOPs: 22.27 | 63: ------------------------------------------------------------------------------------------- 63: valid loss at iteration 17000 | lm loss value: 1.987995E+00 | lm loss PPL: 7.300881E+00 | 63: ------------------------------------------------------------------------------------------- 0: saving checkpoint at iteration 17000 to checkpoints_3b9 0: [2022-11-26 04:44:59,171] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step17000 is begin to save! 32: [2022-11-26 04:44:59,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_21-model_00-model_states.pt... 0: [2022-11-26 04:44:59,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_01-model_00-model_states.pt... 32: [2022-11-26 04:44:59,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_21-model_00-model_states.pt. 32: [2022-11-26 04:44:59,468] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_22-model_00-model_states.pt... 0: [2022-11-26 04:44:59,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_01-model_00-model_states.pt. 0: [2022-11-26 04:44:59,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_03-model_00-model_states.pt... 32: [2022-11-26 04:44:59,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_22-model_00-model_states.pt. 32: [2022-11-26 04:44:59,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_23-model_00-model_states.pt... 0: [2022-11-26 04:44:59,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_03-model_00-model_states.pt. 0: [2022-11-26 04:44:59,774] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_04-model_00-model_states.pt... 32: [2022-11-26 04:44:59,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_23-model_00-model_states.pt. 32: [2022-11-26 04:44:59,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_24-model_00-model_states.pt... 0: [2022-11-26 04:45:00,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_04-model_00-model_states.pt. 0: [2022-11-26 04:45:00,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_05-model_00-model_states.pt... 32: [2022-11-26 04:45:00,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_24-model_00-model_states.pt. 32: [2022-11-26 04:45:00,170] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_25-model_00-model_states.pt... 0: [2022-11-26 04:45:00,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_05-model_00-model_states.pt. 0: [2022-11-26 04:45:00,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_06-model_00-model_states.pt... 32: [2022-11-26 04:45:00,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_25-model_00-model_states.pt. 32: [2022-11-26 04:45:00,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_26-model_00-model_states.pt... 0: [2022-11-26 04:45:00,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_06-model_00-model_states.pt. 0: [2022-11-26 04:45:00,471] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_07-model_00-model_states.pt... 32: [2022-11-26 04:45:00,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_26-model_00-model_states.pt. 32: [2022-11-26 04:45:00,632] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_27-model_00-model_states.pt... 0: [2022-11-26 04:45:00,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_07-model_00-model_states.pt. 0: [2022-11-26 04:45:00,700] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_08-model_00-model_states.pt... 32: [2022-11-26 04:45:00,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_27-model_00-model_states.pt. 32: [2022-11-26 04:45:00,863] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_28-model_00-model_states.pt... 0: [2022-11-26 04:45:00,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_08-model_00-model_states.pt. 0: [2022-11-26 04:45:00,926] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_09-model_00-model_states.pt... 32: [2022-11-26 04:45:01,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_28-model_00-model_states.pt. 32: [2022-11-26 04:45:01,091] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_29-model_00-model_states.pt... 0: [2022-11-26 04:45:01,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_09-model_00-model_states.pt. 0: [2022-11-26 04:45:01,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_10-model_00-model_states.pt... 32: [2022-11-26 04:45:01,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_29-model_00-model_states.pt. 32: [2022-11-26 04:45:01,322] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_30-model_00-model_states.pt... 0: [2022-11-26 04:45:01,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_10-model_00-model_states.pt. 0: [2022-11-26 04:45:01,376] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_11-model_00-model_states.pt... 32: [2022-11-26 04:45:01,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_30-model_00-model_states.pt. 32: [2022-11-26 04:45:01,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_31-model_00-model_states.pt... 0: [2022-11-26 04:45:01,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_11-model_00-model_states.pt. 0: [2022-11-26 04:45:01,600] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_12-model_00-model_states.pt... 32: [2022-11-26 04:45:01,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_31-model_00-model_states.pt. 32: [2022-11-26 04:45:01,784] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_32-model_00-model_states.pt... 0: [2022-11-26 04:45:01,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_12-model_00-model_states.pt. 0: [2022-11-26 04:45:01,830] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_13-model_00-model_states.pt... 32: [2022-11-26 04:45:02,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_32-model_00-model_states.pt. 32: [2022-11-26 04:45:02,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_33-model_00-model_states.pt... 0: [2022-11-26 04:45:02,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_13-model_00-model_states.pt. 0: [2022-11-26 04:45:02,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_14-model_00-model_states.pt... 32: [2022-11-26 04:45:02,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_33-model_00-model_states.pt. 32: [2022-11-26 04:45:02,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_34-model_00-model_states.pt... 0: [2022-11-26 04:45:02,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_14-model_00-model_states.pt. 0: [2022-11-26 04:45:02,275] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_15-model_00-model_states.pt... 32: [2022-11-26 04:45:02,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_34-model_00-model_states.pt. 32: [2022-11-26 04:45:02,460] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_35-model_00-model_states.pt... 0: [2022-11-26 04:45:02,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_15-model_00-model_states.pt. 0: [2022-11-26 04:45:02,499] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_16-model_00-model_states.pt... 32: [2022-11-26 04:45:02,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_35-model_00-model_states.pt. 32: [2022-11-26 04:45:02,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_36-model_00-model_states.pt... 0: [2022-11-26 04:45:02,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_16-model_00-model_states.pt. 0: [2022-11-26 04:45:02,723] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_17-model_00-model_states.pt... 32: [2022-11-26 04:45:02,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_36-model_00-model_states.pt. 32: [2022-11-26 04:45:02,910] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_37-model_00-model_states.pt... 0: [2022-11-26 04:45:02,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_17-model_00-model_states.pt. 0: [2022-11-26 04:45:02,944] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_18-model_00-model_states.pt... 32: [2022-11-26 04:45:03,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_37-model_00-model_states.pt. 32: [2022-11-26 04:45:03,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_38-model_00-model_states.pt... 0: [2022-11-26 04:45:03,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_18-model_00-model_states.pt. 0: [2022-11-26 04:45:03,166] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_19-model_00-model_states.pt... 32: [2022-11-26 04:45:03,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_38-model_00-model_states.pt. 32: [2022-11-26 04:45:03,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_40-model_00-model_states.pt... 32: [2022-11-26 04:45:03,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_40-model_00-model_states.pt. 32: [2022-11-26 04:45:03,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/mp_rank_01_model_states.pt... 32: [2022-11-26 04:45:03,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/mp_rank_01_model_states.pt. 0: [2022-11-26 04:45:03,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_19-model_00-model_states.pt. 0: [2022-11-26 04:45:03,388] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/layer_20-model_00-model_states.pt... 0: [2022-11-26 04:45:03,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/layer_20-model_00-model_states.pt. 0: [2022-11-26 04:45:03,617] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step17000/mp_rank_00_model_states.pt 0: [2022-11-26 04:45:03,617] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/mp_rank_00_model_states.pt... 0: [2022-11-26 04:45:03,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/mp_rank_00_model_states.pt. 51: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 51: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 55: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 53: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 53: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 57: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 61: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 61: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 59: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 59: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 63: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 52: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 52: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 52: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 54: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 56: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 58: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 58: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 62: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 62: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 60: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 60: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 33: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 33: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 33: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 35: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 35: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 35: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 34: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 40: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 44: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 44: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 32: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 42: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 42: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 46: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 36: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 48: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 41: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 41: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 37: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 47: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 49: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 43: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 43: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 45: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 45: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 39: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 39: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 39: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 50: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 50: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 50: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 38: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 38: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 51: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 51: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 55: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 55: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 53: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 57: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 61: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 59: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 63: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 19: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 21: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 21: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 15: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 52: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 54: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 54: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 54: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 56: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 58: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 58: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 62: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 62: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 16: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 60: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 2: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 4: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 14: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 14: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 18: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 10: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 8: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 11: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 17: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 9: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 13: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 13: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 23: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 25: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 27: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 27: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 29: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 31: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 31: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 5: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 33: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 1: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 7: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 7: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 3: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 35: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 22: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 12: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 30: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 24: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 24: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 24: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 28: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 28: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 20: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 34: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 34: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 34: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 40: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 44: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 6: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 6: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 32: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 42: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 42: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 46: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 46: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 36: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 48: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 41: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 37: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 37: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 47: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 47: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 49: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 43: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 45: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 45: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 39: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 39: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 50: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 50: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 38: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 38: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 55: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 53: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 57: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 61: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 59: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 63: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 19: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 21: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 21: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 15: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 15: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 52: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 54: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 54: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 56: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 56: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 16: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 60: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 2: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 2: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 4: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 14: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 18: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 10: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 10: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 8: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 8: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 8: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 0: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 0: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 11: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 17: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 9: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 13: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 23: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 23: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 25: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 27: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 29: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 31: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 5: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 33: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 1: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 7: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 3: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 35: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 22: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 22: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 12: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 30: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 24: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 24: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 28: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 20: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 20: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 40: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 44: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 6: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 32: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 42: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 46: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 36: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 48: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 41: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 37: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 47: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 47: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 49: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 43: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 45: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 55: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 57: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 59: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 63: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 19: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 21: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 21: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 21: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 15: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 56: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 16: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 60: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 2: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 4: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 4: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 14: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 18: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 10: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 10: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 8: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 0: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 11: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 11: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 17: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 17: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 9: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 13: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 23: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 25: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 27: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 29: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 31: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 31: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 5: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 1: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 7: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 3: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 3: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 22: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 12: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 12: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 12: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 30: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 24: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 28: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 20: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 20: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 40: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 40: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 44: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 6: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 32: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 46: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 36: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 48: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 48: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 37: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 37: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 49: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 43: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 45: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 55: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 57: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 63: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 19: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 19: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 21: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 15: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 16: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 16: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 2: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 2: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 4: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 4: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 14: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 14: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 18: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 10: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 8: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 0: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 11: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 17: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 17: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 9: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 9: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 13: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 23: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 23: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 25: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 27: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 29: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 31: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 5: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 5: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 5: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 1: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 7: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 3: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 22: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 12: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 30: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 24: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 28: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 20: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 20: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 6: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 32: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 46: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 36: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 49: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 45: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 57: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 57: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 63: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 19: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 19: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 15: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 16: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 2: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 4: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 14: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 18: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 10: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 8: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 11: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 17: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 9: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 13: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 23: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 25: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 25: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 27: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 29: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 29: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 31: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 5: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 1: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 7: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 3: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 22: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 22: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 12: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 30: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 24: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 28: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 20: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 6: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 6: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 32: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 32: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 49: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 57: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 19: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 15: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 15: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 16: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 2: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 4: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 14: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 18: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 10: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 0: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 11: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 17: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 9: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 13: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 23: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 25: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 27: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 27: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 29: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 31: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 1: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 7: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 3: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 22: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 12: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 30: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 28: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 28: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 6: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 32: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 49: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 16: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 18: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 0: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 9: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 25: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 29: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 1: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 3: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 30: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 1: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 30: [2022-11-26 04:45:04,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step17000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 26: [2022-11-26 04:45:04,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 26: [2022-11-26 04:45:04,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-26 04:45:04,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 16: [2022-11-26 04:45:04,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 10: [2022-11-26 04:45:04,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:45:04,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 10: [2022-11-26 04:45:04,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 6: [2022-11-26 04:45:04,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:45:04,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 10: [2022-11-26 04:45:04,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 6: [2022-11-26 04:45:04,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-26 04:45:04,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 59: [2022-11-26 04:45:04,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-26 04:45:04,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-26 04:45:04,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 15: [2022-11-26 04:45:04,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 0: [2022-11-26 04:45:04,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 15: [2022-11-26 04:45:04,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-26 04:45:04,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 24: [2022-11-26 04:45:04,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-26 04:45:04,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-26 04:45:04,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 11: [2022-11-26 04:45:04,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 38: [2022-11-26 04:45:04,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 11: [2022-11-26 04:45:04,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 38: [2022-11-26 04:45:04,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 11: [2022-11-26 04:45:04,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 38: [2022-11-26 04:45:04,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 27: [2022-11-26 04:45:04,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 44: [2022-11-26 04:45:04,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 21: [2022-11-26 04:45:04,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 27: [2022-11-26 04:45:04,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 44: [2022-11-26 04:45:04,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 27: [2022-11-26 04:45:04,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 44: [2022-11-26 04:45:04,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 21: [2022-11-26 04:45:04,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-26 04:45:04,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 29: [2022-11-26 04:45:04,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-26 04:45:04,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-26 04:45:04,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 46: [2022-11-26 04:45:04,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:45:04,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-26 04:45:04,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 7: [2022-11-26 04:45:04,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 7: [2022-11-26 04:45:04,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-26 04:45:04,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 12: [2022-11-26 04:45:04,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 43: [2022-11-26 04:45:04,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 43: [2022-11-26 04:45:04,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 43: [2022-11-26 04:45:04,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 60: [2022-11-26 04:45:04,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 31: [2022-11-26 04:45:04,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 33: [2022-11-26 04:45:04,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 31: [2022-11-26 04:45:04,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 33: [2022-11-26 04:45:04,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 31: [2022-11-26 04:45:04,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 33: [2022-11-26 04:45:04,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 32: [2022-11-26 04:45:04,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:45:04,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 40: [2022-11-26 04:45:04,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:45:04,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 40: [2022-11-26 04:45:04,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 40: [2022-11-26 04:45:04,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 58: [2022-11-26 04:45:04,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 17: [2022-11-26 04:45:04,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 58: [2022-11-26 04:45:04,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 17: [2022-11-26 04:45:04,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 58: [2022-11-26 04:45:04,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 17: [2022-11-26 04:45:04,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 61: [2022-11-26 04:45:04,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 12: [2022-11-26 04:45:04,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 61: [2022-11-26 04:45:04,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 12: [2022-11-26 04:45:04,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 61: [2022-11-26 04:45:04,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 19: [2022-11-26 04:45:04,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-26 04:45:04,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-26 04:45:04,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 39: [2022-11-26 04:45:04,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-26 04:45:04,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-26 04:45:04,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 56: [2022-11-26 04:45:04,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 0: [2022-11-26 04:45:04,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-26 04:45:04,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 20: [2022-11-26 04:45:04,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 56: [2022-11-26 04:45:04,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 0: [2022-11-26 04:45:04,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 20: [2022-11-26 04:45:04,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 56: [2022-11-26 04:45:04,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 20: [2022-11-26 04:45:04,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 23: [2022-11-26 04:45:04,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-26 04:45:04,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-26 04:45:04,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 59: [2022-11-26 04:45:04,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 63: [2022-11-26 04:45:04,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-26 04:45:04,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 59: [2022-11-26 04:45:04,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 35: [2022-11-26 04:45:04,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 59: [2022-11-26 04:45:04,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 63: [2022-11-26 04:45:04,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 35: [2022-11-26 04:45:04,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-26 04:45:04,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 41: [2022-11-26 04:45:04,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-26 04:45:04,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-26 04:45:04,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 16: [2022-11-26 04:45:04,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:45:04,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-26 04:45:04,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 50: [2022-11-26 04:45:04,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 50: [2022-11-26 04:45:04,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-26 04:45:04,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 26: [2022-11-26 04:45:04,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 26: [2022-11-26 04:45:04,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-26 04:45:04,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 3: [2022-11-26 04:45:04,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 8: [2022-11-26 04:45:04,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 3: [2022-11-26 04:45:04,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 42: [2022-11-26 04:45:04,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 8: [2022-11-26 04:45:04,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 3: [2022-11-26 04:45:04,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 42: [2022-11-26 04:45:04,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 8: [2022-11-26 04:45:04,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 42: [2022-11-26 04:45:04,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 25: [2022-11-26 04:45:04,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 25: [2022-11-26 04:45:04,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 25: [2022-11-26 04:45:04,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 6: [2022-11-26 04:45:04,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-26 04:45:04,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-26 04:45:04,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 7: [2022-11-26 04:45:04,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-26 04:45:04,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-26 04:45:04,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 30: [2022-11-26 04:45:04,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 25: [2022-11-26 04:45:04,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 24: [2022-11-26 04:45:04,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 25: [2022-11-26 04:45:04,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 24: [2022-11-26 04:45:04,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 25: [2022-11-26 04:45:04,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 24: [2022-11-26 04:45:04,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 30: [2022-11-26 04:45:04,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-26 04:45:04,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 28: [2022-11-26 04:45:04,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-26 04:45:04,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-26 04:45:04,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 22: [2022-11-26 04:45:04,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 22: [2022-11-26 04:45:04,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 22: [2022-11-26 04:45:04,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 29: [2022-11-26 04:45:04,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-26 04:45:04,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 29: [2022-11-26 04:45:04,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 15: [2022-11-26 04:45:04,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-26 04:45:04,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-26 04:45:04,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 40: [2022-11-26 04:45:04,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-26 04:45:04,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-26 04:45:04,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 58: [2022-11-26 04:45:04,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-26 04:45:04,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-26 04:45:04,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 37: [2022-11-26 04:45:04,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 37: [2022-11-26 04:45:04,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-26 04:45:04,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 49: [2022-11-26 04:45:04,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-26 04:45:04,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-26 04:45:04,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-26 04:45:04,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 11: [2022-11-26 04:45:04,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 49: [2022-11-26 04:45:04,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 11: [2022-11-26 04:45:04,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 30: [2022-11-26 04:45:04,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 49: [2022-11-26 04:45:04,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 11: [2022-11-26 04:45:04,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 30: [2022-11-26 04:45:04,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-26 04:45:04,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 10: [2022-11-26 04:45:04,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-26 04:45:04,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-26 04:45:04,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 46: [2022-11-26 04:45:04,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:45:04,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 62: [2022-11-26 04:45:04,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:45:04,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 62: [2022-11-26 04:45:04,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-26 04:45:04,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 27: [2022-11-26 04:45:04,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 27: [2022-11-26 04:45:04,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 12: [2022-11-26 04:45:04,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 27: [2022-11-26 04:45:04,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 12: [2022-11-26 04:45:04,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-26 04:45:04,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 32: [2022-11-26 04:45:04,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:45:04,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-26 04:45:04,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 44: [2022-11-26 04:45:04,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-26 04:45:04,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-26 04:45:04,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 8: [2022-11-26 04:45:04,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-26 04:45:04,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-26 04:45:04,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 51: [2022-11-26 04:45:04,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:45:04,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-26 04:45:04,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 63: [2022-11-26 04:45:04,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-26 04:45:04,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-26 04:45:04,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 23: [2022-11-26 04:45:04,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 22: [2022-11-26 04:45:04,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 23: [2022-11-26 04:45:04,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 22: [2022-11-26 04:45:04,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 23: [2022-11-26 04:45:04,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 22: [2022-11-26 04:45:04,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 39: [2022-11-26 04:45:04,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-26 04:45:04,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-26 04:45:04,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 37: [2022-11-26 04:45:04,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-26 04:45:04,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 17: [2022-11-26 04:45:04,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 37: [2022-11-26 04:45:04,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 17: [2022-11-26 04:45:04,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-26 04:45:04,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 61: [2022-11-26 04:45:04,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-26 04:45:04,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-26 04:45:04,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 56: [2022-11-26 04:45:04,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 56: [2022-11-26 04:45:04,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 56: [2022-11-26 04:45:04,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 28: [2022-11-26 04:45:04,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 59: [2022-11-26 04:45:04,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 28: [2022-11-26 04:45:04,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 59: [2022-11-26 04:45:04,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 28: [2022-11-26 04:45:04,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 59: [2022-11-26 04:45:04,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 43: [2022-11-26 04:45:04,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-26 04:45:04,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-26 04:45:04,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 50: [2022-11-26 04:45:04,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 50: [2022-11-26 04:45:04,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-26 04:45:04,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 42: [2022-11-26 04:45:04,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 42: [2022-11-26 04:45:04,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 0: [2022-11-26 04:45:04,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 42: [2022-11-26 04:45:04,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 0: [2022-11-26 04:45:04,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-26 04:45:04,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 51: [2022-11-26 04:45:04,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:45:04,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-26 04:45:04,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 35: [2022-11-26 04:45:04,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 35: [2022-11-26 04:45:04,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 26: [2022-11-26 04:45:04,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 35: [2022-11-26 04:45:04,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 26: [2022-11-26 04:45:04,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-26 04:45:04,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 6: [2022-11-26 04:45:04,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 47: [2022-11-26 04:45:04,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-26 04:45:04,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-26 04:45:04,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-26 04:45:04,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-26 04:45:04,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 47: [2022-11-26 04:45:04,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 16: [2022-11-26 04:45:04,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:45:04,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-26 04:45:04,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 0: [2022-11-26 04:45:04,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-26 04:45:04,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 19: [2022-11-26 04:45:04,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-26 04:45:04,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 58: [2022-11-26 04:45:04,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 19: [2022-11-26 04:45:04,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 58: [2022-11-26 04:45:04,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-26 04:45:04,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 44: [2022-11-26 04:45:04,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 25: [2022-11-26 04:45:04,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 44: [2022-11-26 04:45:04,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 6: [2022-11-26 04:45:04,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 44: [2022-11-26 04:45:04,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 6: [2022-11-26 04:45:04,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 15: [2022-11-26 04:45:04,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-26 04:45:04,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 25: [2022-11-26 04:45:04,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 15: [2022-11-26 04:45:04,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 25: [2022-11-26 04:45:04,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 61: [2022-11-26 04:45:04,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-26 04:45:04,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-26 04:45:04,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 46: [2022-11-26 04:45:04,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:45:04,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-26 04:45:04,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 12: [2022-11-26 04:45:04,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-26 04:45:04,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-26 04:45:04,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 60: [2022-11-26 04:45:04,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 13: [2022-11-26 04:45:04,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-26 04:45:04,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 1: [2022-11-26 04:45:04,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 60: [2022-11-26 04:45:04,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 13: [2022-11-26 04:45:04,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-26 04:45:04,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 1: [2022-11-26 04:45:04,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 60: [2022-11-26 04:45:04,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 13: [2022-11-26 04:45:04,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 1: [2022-11-26 04:45:04,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 60: [2022-11-26 04:45:04,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 13: [2022-11-26 04:45:04,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 60: [2022-11-26 04:45:04,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 13: [2022-11-26 04:45:04,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 3: [2022-11-26 04:45:04,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 13: [2022-11-26 04:45:04,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 13: [2022-11-26 04:45:04,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 3: [2022-11-26 04:45:04,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 3: [2022-11-26 04:45:04,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 49: [2022-11-26 04:45:04,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-26 04:45:04,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 60: [2022-11-26 04:45:04,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 49: [2022-11-26 04:45:04,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 60: [2022-11-26 04:45:04,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-26 04:45:04,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 28: [2022-11-26 04:45:04,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 28: [2022-11-26 04:45:04,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 45: [2022-11-26 04:45:04,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-26 04:45:04,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 28: [2022-11-26 04:45:04,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 45: [2022-11-26 04:45:04,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-26 04:45:04,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 20: [2022-11-26 04:45:04,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 45: [2022-11-26 04:45:04,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 45: [2022-11-26 04:45:04,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 20: [2022-11-26 04:45:04,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-26 04:45:04,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 9: [2022-11-26 04:45:04,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 9: [2022-11-26 04:45:04,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-26 04:45:04,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-26 04:45:04,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-26 04:45:04,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-26 04:45:04,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 9: [2022-11-26 04:45:04,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 9: [2022-11-26 04:45:04,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-26 04:45:04,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 31: [2022-11-26 04:45:04,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-26 04:45:04,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-26 04:45:04,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 10: [2022-11-26 04:45:04,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-26 04:45:04,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-26 04:45:04,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 27: [2022-11-26 04:45:04,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 21: [2022-11-26 04:45:04,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 54: [2022-11-26 04:45:04,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 27: [2022-11-26 04:45:04,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-26 04:45:04,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 21: [2022-11-26 04:45:04,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-26 04:45:04,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 54: [2022-11-26 04:45:04,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-26 04:45:04,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 33: [2022-11-26 04:45:04,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-26 04:45:04,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-26 04:45:04,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 7: [2022-11-26 04:45:04,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 7: [2022-11-26 04:45:04,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 7: [2022-11-26 04:45:04,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 50: [2022-11-26 04:45:04,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:45:04,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 50: [2022-11-26 04:45:04,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 32: [2022-11-26 04:45:04,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 35: [2022-11-26 04:45:04,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:45:04,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 50: [2022-11-26 04:45:04,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 35: [2022-11-26 04:45:04,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-26 04:45:04,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 19: [2022-11-26 04:45:04,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 19: [2022-11-26 04:45:04,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-26 04:45:04,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 8: [2022-11-26 04:45:04,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-26 04:45:04,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 8: [2022-11-26 04:45:04,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 54: [2022-11-26 04:45:04,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 48: [2022-11-26 04:45:04,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-26 04:45:04,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 3: [2022-11-26 04:45:04,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 48: [2022-11-26 04:45:04,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-26 04:45:04,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-26 04:45:04,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 48: [2022-11-26 04:45:04,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 3: [2022-11-26 04:45:04,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-26 04:45:04,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 22: [2022-11-26 04:45:04,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-26 04:45:04,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-26 04:45:04,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 17: [2022-11-26 04:45:04,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 17: [2022-11-26 04:45:04,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 11: [2022-11-26 04:45:04,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 11: [2022-11-26 04:45:04,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-26 04:45:04,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 17: [2022-11-26 04:45:04,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 37: [2022-11-26 04:45:04,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-26 04:45:04,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 39: [2022-11-26 04:45:04,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 37: [2022-11-26 04:45:04,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 39: [2022-11-26 04:45:04,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-26 04:45:04,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 56: [2022-11-26 04:45:04,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 56: [2022-11-26 04:45:04,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 21: [2022-11-26 04:45:04,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 45: [2022-11-26 04:45:04,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 21: [2022-11-26 04:45:04,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 45: [2022-11-26 04:45:04,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 21: [2022-11-26 04:45:04,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 45: [2022-11-26 04:45:04,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 54: [2022-11-26 04:45:04,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 56: [2022-11-26 04:45:04,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 54: [2022-11-26 04:45:04,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 15: [2022-11-26 04:45:04,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 62: [2022-11-26 04:45:04,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 15: [2022-11-26 04:45:04,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 62: [2022-11-26 04:45:04,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-26 04:45:04,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 15: [2022-11-26 04:45:04,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 20: [2022-11-26 04:45:04,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-26 04:45:04,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-26 04:45:04,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 1: [2022-11-26 04:45:04,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 42: [2022-11-26 04:45:04,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 48: [2022-11-26 04:45:04,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 1: [2022-11-26 04:45:04,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-26 04:45:04,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 48: [2022-11-26 04:45:04,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 40: [2022-11-26 04:45:04,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 42: [2022-11-26 04:45:04,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 63: [2022-11-26 04:45:04,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 40: [2022-11-26 04:45:04,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 48: [2022-11-26 04:45:04,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 63: [2022-11-26 04:45:04,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 40: [2022-11-26 04:45:04,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 63: [2022-11-26 04:45:04,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 42: [2022-11-26 04:45:04,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 55: [2022-11-26 04:45:04,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-26 04:45:04,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 55: [2022-11-26 04:45:04,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-26 04:45:04,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 55: [2022-11-26 04:45:04,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 55: [2022-11-26 04:45:04,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 16: [2022-11-26 04:45:04,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:45:04,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 6: [2022-11-26 04:45:04,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:45:04,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 6: [2022-11-26 04:45:04,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-26 04:45:04,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 36: [2022-11-26 04:45:04,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-26 04:45:04,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 36: [2022-11-26 04:45:04,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 36: [2022-11-26 04:45:04,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 26: [2022-11-26 04:45:04,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 36: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 34: [2022-11-26 04:45:04,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 36: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 26: [2022-11-26 04:45:04,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 52: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-26 04:45:04,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-26 04:45:04,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 34: [2022-11-26 04:45:04,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 52: [2022-11-26 04:45:04,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 52: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 52: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 5: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 5: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 34: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 5: [2022-11-26 04:45:04,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-26 04:45:04,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-26 04:45:04,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 5: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 5: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 34: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 25: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 34: [2022-11-26 04:45:04,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 25: [2022-11-26 04:45:04,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 34: [2022-11-26 04:45:04,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 1: [2022-11-26 04:45:04,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 1: [2022-11-26 04:45:04,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 1: [2022-11-26 04:45:04,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 23: [2022-11-26 04:45:04,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-26 04:45:04,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-26 04:45:04,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 30: [2022-11-26 04:45:04,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 57: [2022-11-26 04:45:04,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 57: [2022-11-26 04:45:04,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 57: [2022-11-26 04:45:04,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-26 04:45:04,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 30: [2022-11-26 04:45:04,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 36: [2022-11-26 04:45:04,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 57: [2022-11-26 04:45:04,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 57: [2022-11-26 04:45:04,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 30: [2022-11-26 04:45:04,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 24: [2022-11-26 04:45:04,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 36: [2022-11-26 04:45:04,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 24: [2022-11-26 04:45:04,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 36: [2022-11-26 04:45:04,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 24: [2022-11-26 04:45:04,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 31: [2022-11-26 04:45:04,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 54: [2022-11-26 04:45:04,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 31: [2022-11-26 04:45:04,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 47: [2022-11-26 04:45:04,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 54: [2022-11-26 04:45:04,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 31: [2022-11-26 04:45:04,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 47: [2022-11-26 04:45:04,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 54: [2022-11-26 04:45:04,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 47: [2022-11-26 04:45:04,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 0: [2022-11-26 04:45:04,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-26 04:45:04,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-26 04:45:04,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 55: [2022-11-26 04:45:04,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 55: [2022-11-26 04:45:04,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-26 04:45:04,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 43: [2022-11-26 04:45:04,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-26 04:45:04,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-26 04:45:04,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 10: [2022-11-26 04:45:04,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-26 04:45:04,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-26 04:45:04,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 34: [2022-11-26 04:45:04,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-26 04:45:04,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-26 04:45:04,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 33: [2022-11-26 04:45:04,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-26 04:45:04,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 24: [2022-11-26 04:45:04,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 33: [2022-11-26 04:45:04,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 24: [2022-11-26 04:45:04,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-26 04:45:04,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 7: [2022-11-26 04:45:04,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-26 04:45:04,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-26 04:45:04,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 36: [2022-11-26 04:45:04,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 29: [2022-11-26 04:45:04,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-26 04:45:04,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 38: [2022-11-26 04:45:04,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 38: [2022-11-26 04:45:04,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 29: [2022-11-26 04:45:04,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 38: [2022-11-26 04:45:04,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 27: [2022-11-26 04:45:04,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-26 04:45:04,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-26 04:45:04,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 4: [2022-11-26 04:45:04,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 4: [2022-11-26 04:45:04,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 4: [2022-11-26 04:45:04,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 4: [2022-11-26 04:45:04,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-26 04:45:04,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-26 04:45:04,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-26 04:45:04,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-26 04:45:04,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-26 04:45:04,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 4: [2022-11-26 04:45:04,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 4: [2022-11-26 04:45:04,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 4: [2022-11-26 04:45:04,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 57: [2022-11-26 04:45:04,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 57: [2022-11-26 04:45:04,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 57: [2022-11-26 04:45:04,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 61: [2022-11-26 04:45:04,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 11: [2022-11-26 04:45:04,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 61: [2022-11-26 04:45:04,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 11: [2022-11-26 04:45:04,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 61: [2022-11-26 04:45:04,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 11: [2022-11-26 04:45:04,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 29: [2022-11-26 04:45:04,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-26 04:45:04,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-26 04:45:04,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 40: [2022-11-26 04:45:04,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-26 04:45:04,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-26 04:45:04,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 51: [2022-11-26 04:45:04,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:45:04,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-26 04:45:04,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 41: [2022-11-26 04:45:04,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 41: [2022-11-26 04:45:04,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 41: [2022-11-26 04:45:04,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 53: [2022-11-26 04:45:04,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-26 04:45:04,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-26 04:45:04,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 53: [2022-11-26 04:45:04,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-26 04:45:04,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-26 04:45:04,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 36: [2022-11-26 04:45:04,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-26 04:45:04,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 58: [2022-11-26 04:45:04,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-26 04:45:04,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-26 04:45:04,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 53: [2022-11-26 04:45:04,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-26 04:45:04,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-26 04:45:04,884] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-26 04:45:04,884] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 60: [2022-11-26 04:45:04,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 53: [2022-11-26 04:45:04,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 53: [2022-11-26 04:45:04,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 60: [2022-11-26 04:45:04,884] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-26 04:45:04,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 59: [2022-11-26 04:45:04,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-26 04:45:04,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-26 04:45:04,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 9: [2022-11-26 04:45:04,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 9: [2022-11-26 04:45:04,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-26 04:45:04,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 2: [2022-11-26 04:45:04,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 2: [2022-11-26 04:45:04,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-26 04:45:04,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-26 04:45:04,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-26 04:45:04,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-26 04:45:04,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-26 04:45:04,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-26 04:45:04,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 2: [2022-11-26 04:45:04,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 2: [2022-11-26 04:45:04,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 2: [2022-11-26 04:45:04,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-26 04:45:04,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 21: [2022-11-26 04:45:04,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-26 04:45:04,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-26 04:45:04,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 54: [2022-11-26 04:45:04,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 38: [2022-11-26 04:45:04,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-26 04:45:04,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-26 04:45:04,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 54: [2022-11-26 04:45:04,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-26 04:45:04,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 41: [2022-11-26 04:45:04,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 41: [2022-11-26 04:45:04,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 41: [2022-11-26 04:45:04,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 1: [2022-11-26 04:45:04,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-26 04:45:04,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 1: [2022-11-26 04:45:04,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 18: [2022-11-26 04:45:04,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-26 04:45:04,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 18: [2022-11-26 04:45:04,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-26 04:45:04,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-26 04:45:04,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-26 04:45:04,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-26 04:45:04,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-26 04:45:04,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-26 04:45:04,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 18: [2022-11-26 04:45:04,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 18: [2022-11-26 04:45:04,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 18: [2022-11-26 04:45:04,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 3: [2022-11-26 04:45:04,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-26 04:45:04,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-26 04:45:04,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 12: [2022-11-26 04:45:04,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 12: [2022-11-26 04:45:04,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-26 04:45:04,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 39: [2022-11-26 04:45:04,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-26 04:45:04,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-26 04:45:04,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 62: [2022-11-26 04:45:04,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-26 04:45:04,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-26 04:45:04,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 38: [2022-11-26 04:45:04,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 43: [2022-11-26 04:45:04,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-26 04:45:04,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-26 04:45:04,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 52: [2022-11-26 04:45:04,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-26 04:45:04,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-26 04:45:04,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 38: [2022-11-26 04:45:04,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-26 04:45:04,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 46: [2022-11-26 04:45:04,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:45:04,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-26 04:45:04,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 63: [2022-11-26 04:45:04,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-26 04:45:04,929] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-26 04:45:04,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 50: [2022-11-26 04:45:04,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-26 04:45:04,929] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-26 04:45:04,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 31: [2022-11-26 04:45:04,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 31: [2022-11-26 04:45:04,930] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-26 04:45:04,930] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 13: [2022-11-26 04:45:04,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-26 04:45:04,937] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-26 04:45:04,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 34: [2022-11-26 04:45:04,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 34: [2022-11-26 04:45:04,938] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-26 04:45:04,938] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 44: [2022-11-26 04:45:04,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-26 04:45:04,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-26 04:45:04,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 5: [2022-11-26 04:45:04,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-26 04:45:04,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-26 04:45:04,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 33: [2022-11-26 04:45:04,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-26 04:45:04,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-26 04:45:04,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 17: [2022-11-26 04:45:04,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-26 04:45:04,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-26 04:45:04,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 56: [2022-11-26 04:45:04,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-26 04:45:04,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-26 04:45:04,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 28: [2022-11-26 04:45:04,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 28: [2022-11-26 04:45:04,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-26 04:45:04,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 48: [2022-11-26 04:45:04,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-26 04:45:04,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-26 04:45:04,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 51: [2022-11-26 04:45:04,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:45:04,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-26 04:45:04,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 49: [2022-11-26 04:45:04,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-26 04:45:04,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-26 04:45:04,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 45: [2022-11-26 04:45:04,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 45: [2022-11-26 04:45:04,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 45: [2022-11-26 04:45:04,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 32: [2022-11-26 04:45:04,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:45:04,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-26 04:45:04,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 55: [2022-11-26 04:45:04,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-26 04:45:04,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-26 04:45:04,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 30: [2022-11-26 04:45:04,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 30: [2022-11-26 04:45:04,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 23: [2022-11-26 04:45:04,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 30: [2022-11-26 04:45:04,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 23: [2022-11-26 04:45:04,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 23: [2022-11-26 04:45:04,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 8: [2022-11-26 04:45:04,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-26 04:45:04,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-26 04:45:04,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 37: [2022-11-26 04:45:04,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-26 04:45:04,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-26 04:45:04,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 41: [2022-11-26 04:45:04,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 57: [2022-11-26 04:45:04,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 41: [2022-11-26 04:45:04,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 41: [2022-11-26 04:45:04,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 57: [2022-11-26 04:45:04,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 35: [2022-11-26 04:45:04,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-26 04:45:04,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 57: [2022-11-26 04:45:04,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 35: [2022-11-26 04:45:04,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 22: [2022-11-26 04:45:04,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-26 04:45:04,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-26 04:45:04,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 47: [2022-11-26 04:45:04,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-26 04:45:04,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-26 04:45:04,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 19: [2022-11-26 04:45:04,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 15: [2022-11-26 04:45:04,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 19: [2022-11-26 04:45:04,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-26 04:45:04,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 15: [2022-11-26 04:45:04,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-26 04:45:04,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 24: [2022-11-26 04:45:04,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-26 04:45:04,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-26 04:45:04,970] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 38: [2022-11-26 04:45:04,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 38: [2022-11-26 04:45:04,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-26 04:45:04,970] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 34: [2022-11-26 04:45:04,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 34: [2022-11-26 04:45:04,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-26 04:45:04,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 20: [2022-11-26 04:45:04,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-26 04:45:04,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-26 04:45:04,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 42: [2022-11-26 04:45:04,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 42: [2022-11-26 04:45:04,973] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-26 04:45:04,973] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 29: [2022-11-26 04:45:04,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 29: [2022-11-26 04:45:04,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-26 04:45:04,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 2: [2022-11-26 04:45:04,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-26 04:45:04,976] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-26 04:45:04,976] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 4: [2022-11-26 04:45:04,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-26 04:45:04,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-26 04:45:04,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 25: [2022-11-26 04:45:04,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-26 04:45:04,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-26 04:45:04,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 27: [2022-11-26 04:45:04,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 7: [2022-11-26 04:45:04,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 27: [2022-11-26 04:45:04,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 0: [2022-11-26 04:45:04,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 27: [2022-11-26 04:45:04,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 7: [2022-11-26 04:45:04,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-26 04:45:04,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 0: [2022-11-26 04:45:04,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-26 04:45:04,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 54: [2022-11-26 04:45:04,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 54: [2022-11-26 04:45:04,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-26 04:45:04,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 36: [2022-11-26 04:45:04,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-26 04:45:04,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-26 04:45:04,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 40: [2022-11-26 04:45:04,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-26 04:45:04,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-26 04:45:04,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 10: [2022-11-26 04:45:04,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 10: [2022-11-26 04:45:04,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-26 04:45:04,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 9: [2022-11-26 04:45:04,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-26 04:45:04,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-26 04:45:04,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 60: [2022-11-26 04:45:04,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-26 04:45:04,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-26 04:45:04,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 11: [2022-11-26 04:45:04,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-26 04:45:04,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 11: [2022-11-26 04:45:04,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 16: [2022-11-26 04:45:04,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:45:04,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-26 04:45:04,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 6: [2022-11-26 04:45:04,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-26 04:45:04,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-26 04:45:04,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 53: [2022-11-26 04:45:04,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-26 04:45:04,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-26 04:45:04,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 59: [2022-11-26 04:45:04,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-26 04:45:04,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-26 04:45:04,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 21: [2022-11-26 04:45:04,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-26 04:45:04,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-26 04:45:04,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 61: [2022-11-26 04:45:04,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-26 04:45:04,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-26 04:45:04,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 18: [2022-11-26 04:45:04,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-26 04:45:04,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-26 04:45:04,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 52: [2022-11-26 04:45:04,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-26 04:45:04,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-26 04:45:04,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 58: [2022-11-26 04:45:04,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-26 04:45:04,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-26 04:45:04,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 1: [2022-11-26 04:45:04,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-26 04:45:04,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-26 04:45:04,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 12: [2022-11-26 04:45:04,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-26 04:45:04,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-26 04:45:04,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 3: [2022-11-26 04:45:05,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-26 04:45:05,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 3: [2022-11-26 04:45:05,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 43: [2022-11-26 04:45:05,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:45:05,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 62: [2022-11-26 04:45:05,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:45:05,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 43: [2022-11-26 04:45:05,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-26 04:45:05,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 62: [2022-11-26 04:45:05,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 46: [2022-11-26 04:45:05,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 62: [2022-11-26 04:45:05,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 39: [2022-11-26 04:45:05,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 39: [2022-11-26 04:45:05,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-26 04:45:05,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 31: [2022-11-26 04:45:05,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-26 04:45:05,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-26 04:45:05,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 44: [2022-11-26 04:45:05,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-26 04:45:05,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-26 04:45:05,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 63: [2022-11-26 04:45:05,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 63: [2022-11-26 04:45:05,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-26 04:45:05,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 13: [2022-11-26 04:45:05,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-26 04:45:05,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-26 04:45:05,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 49: [2022-11-26 04:45:05,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-26 04:45:05,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-26 04:45:05,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 56: [2022-11-26 04:45:05,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-26 04:45:05,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-26 04:45:05,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 17: [2022-11-26 04:45:05,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-26 04:45:05,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-26 04:45:05,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 28: [2022-11-26 04:45:05,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-26 04:45:05,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-26 04:45:05,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 5: [2022-11-26 04:45:05,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-26 04:45:05,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-26 04:45:05,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 48: [2022-11-26 04:45:05,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-26 04:45:05,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-26 04:45:05,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 50: [2022-11-26 04:45:05,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-26 04:45:05,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-26 04:45:05,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 33: [2022-11-26 04:45:05,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-26 04:45:05,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-26 04:45:05,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 32: [2022-11-26 04:45:05,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:45:05,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-26 04:45:05,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 51: [2022-11-26 04:45:05,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:45:05,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-26 04:45:05,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 23: [2022-11-26 04:45:05,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-26 04:45:05,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-26 04:45:05,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 45: [2022-11-26 04:45:05,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-26 04:45:05,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-26 04:45:05,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 30: [2022-11-26 04:45:05,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 35: [2022-11-26 04:45:05,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 30: [2022-11-26 04:45:05,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 35: [2022-11-26 04:45:05,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-26 04:45:05,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 30: [2022-11-26 04:45:05,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 19: [2022-11-26 04:45:05,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 19: [2022-11-26 04:45:05,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-26 04:45:05,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 22: [2022-11-26 04:45:05,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-26 04:45:05,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-26 04:45:05,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 41: [2022-11-26 04:45:05,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-26 04:45:05,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-26 04:45:05,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 55: [2022-11-26 04:45:05,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-26 04:45:05,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-26 04:45:05,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 20: [2022-11-26 04:45:05,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-26 04:45:05,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-26 04:45:05,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 57: [2022-11-26 04:45:05,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 57: [2022-11-26 04:45:05,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-26 04:45:05,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 34: [2022-11-26 04:45:05,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-26 04:45:05,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-26 04:45:05,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 47: [2022-11-26 04:45:05,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-26 04:45:05,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-26 04:45:05,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 37: [2022-11-26 04:45:05,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-26 04:45:05,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-26 04:45:05,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 8: [2022-11-26 04:45:05,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-26 04:45:05,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-26 04:45:05,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 42: [2022-11-26 04:45:05,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 42: [2022-11-26 04:45:05,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-26 04:45:05,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 29: [2022-11-26 04:45:05,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-26 04:45:05,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-26 04:45:05,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 4: [2022-11-26 04:45:05,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-26 04:45:05,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-26 04:45:05,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 0: [2022-11-26 04:45:05,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 6: [2022-11-26 04:45:05,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 0: [2022-11-26 04:45:05,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 6: [2022-11-26 04:45:05,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 0: [2022-11-26 04:45:05,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 6: [2022-11-26 04:45:05,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 53: [2022-11-26 04:45:05,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-26 04:45:05,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-26 04:45:05,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 26: [2022-11-26 04:45:05,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-26 04:45:05,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-26 04:45:05,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 36: [2022-11-26 04:45:05,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-26 04:45:05,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-26 04:45:05,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 24: [2022-11-26 04:45:05,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-26 04:45:05,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-26 04:45:05,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 27: [2022-11-26 04:45:05,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-26 04:45:05,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-26 04:45:05,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 54: [2022-11-26 04:45:05,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 54: [2022-11-26 04:45:05,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-26 04:45:05,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 2: [2022-11-26 04:45:05,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:45:05,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 2: [2022-11-26 04:45:05,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-26 04:45:05,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 16: [2022-11-26 04:45:05,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-26 04:45:05,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 60: [2022-11-26 04:45:05,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-26 04:45:05,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-26 04:45:05,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 25: [2022-11-26 04:45:05,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 25: [2022-11-26 04:45:05,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-26 04:45:05,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 38: [2022-11-26 04:45:05,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 38: [2022-11-26 04:45:05,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-26 04:45:05,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 59: [2022-11-26 04:45:05,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 59: [2022-11-26 04:45:05,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-26 04:45:05,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 40: [2022-11-26 04:45:05,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-26 04:45:05,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-26 04:45:05,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 58: [2022-11-26 04:45:05,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 58: [2022-11-26 04:45:05,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-26 04:45:05,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 18: [2022-11-26 04:45:05,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-26 04:45:05,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-26 04:45:05,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 26: [2022-11-26 04:45:05,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 26: [2022-11-26 04:45:05,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 26: [2022-11-26 04:45:05,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 11: [2022-11-26 04:45:05,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-26 04:45:05,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-26 04:45:05,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 9: [2022-11-26 04:45:05,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 9: [2022-11-26 04:45:05,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-26 04:45:05,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 1: [2022-11-26 04:45:05,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-26 04:45:05,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-26 04:45:05,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 61: [2022-11-26 04:45:05,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 61: [2022-11-26 04:45:05,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-26 04:45:05,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 7: [2022-11-26 04:45:05,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-26 04:45:05,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-26 04:45:05,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 52: [2022-11-26 04:45:05,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-26 04:45:05,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-26 04:45:05,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 12: [2022-11-26 04:45:05,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-26 04:45:05,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-26 04:45:05,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 3: [2022-11-26 04:45:05,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-26 04:45:05,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-26 04:45:05,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 51: [2022-11-26 04:45:05,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:45:05,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-26 04:45:05,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 62: [2022-11-26 04:45:05,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-26 04:45:05,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-26 04:45:05,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 43: [2022-11-26 04:45:05,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-26 04:45:05,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 43: [2022-11-26 04:45:05,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 33: [2022-11-26 04:45:05,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 33: [2022-11-26 04:45:05,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-26 04:45:05,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 44: [2022-11-26 04:45:05,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-26 04:45:05,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-26 04:45:05,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 31: [2022-11-26 04:45:05,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 31: [2022-11-26 04:45:05,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-26 04:45:05,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 39: [2022-11-26 04:45:05,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-26 04:45:05,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-26 04:45:05,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 21: [2022-11-26 04:45:05,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 21: [2022-11-26 04:45:05,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-26 04:45:05,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 46: [2022-11-26 04:45:05,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:45:05,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-26 04:45:05,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 5: [2022-11-26 04:45:05,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-26 04:45:05,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 15: [2022-11-26 04:45:05,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 5: [2022-11-26 04:45:05,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 15: [2022-11-26 04:45:05,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-26 04:45:05,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 48: [2022-11-26 04:45:05,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-26 04:45:05,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-26 04:45:05,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 49: [2022-11-26 04:45:05,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 49: [2022-11-26 04:45:05,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-26 04:45:05,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 17: [2022-11-26 04:45:05,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-26 04:45:05,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-26 04:45:05,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 10: [2022-11-26 04:45:05,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-26 04:45:05,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 10: [2022-11-26 04:45:05,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 45: [2022-11-26 04:45:05,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-26 04:45:05,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-26 04:45:05,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 63: [2022-11-26 04:45:05,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-26 04:45:05,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-26 04:45:05,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 30: [2022-11-26 04:45:05,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-26 04:45:05,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-26 04:45:05,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 28: [2022-11-26 04:45:05,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-26 04:45:05,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-26 04:45:05,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 19: [2022-11-26 04:45:05,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-26 04:45:05,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-26 04:45:05,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 13: [2022-11-26 04:45:05,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 13: [2022-11-26 04:45:05,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-26 04:45:05,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 35: [2022-11-26 04:45:05,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-26 04:45:05,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-26 04:45:05,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 32: [2022-11-26 04:45:05,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:45:05,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-26 04:45:05,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 56: [2022-11-26 04:45:05,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-26 04:45:05,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 56: [2022-11-26 04:45:05,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 55: [2022-11-26 04:45:05,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-26 04:45:05,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-26 04:45:05,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 23: [2022-11-26 04:45:05,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-26 04:45:05,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-26 04:45:05,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 50: [2022-11-26 04:45:05,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-26 04:45:05,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-26 04:45:05,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 34: [2022-11-26 04:45:05,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 34: [2022-11-26 04:45:05,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-26 04:45:05,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 57: [2022-11-26 04:45:05,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 47: [2022-11-26 04:45:05,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 57: [2022-11-26 04:45:05,087] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 47: [2022-11-26 04:45:05,087] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 57: [2022-11-26 04:45:05,087] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 47: [2022-11-26 04:45:05,087] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 8: [2022-11-26 04:45:05,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-26 04:45:05,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 41: [2022-11-26 04:45:05,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 8: [2022-11-26 04:45:05,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 41: [2022-11-26 04:45:05,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-26 04:45:05,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 37: [2022-11-26 04:45:05,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-26 04:45:05,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-26 04:45:05,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 38: [2022-11-26 04:45:05,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-26 04:45:05,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-26 04:45:05,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 24: [2022-11-26 04:45:05,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-26 04:45:05,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-26 04:45:05,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 29: [2022-11-26 04:45:05,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 36: [2022-11-26 04:45:05,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 29: [2022-11-26 04:45:05,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-26 04:45:05,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 36: [2022-11-26 04:45:05,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-26 04:45:05,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 16: [2022-11-26 04:45:05,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:45:05,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-26 04:45:05,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 15: [2022-11-26 04:45:05,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-26 04:45:05,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-26 04:45:05,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 2: [2022-11-26 04:45:05,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-26 04:45:05,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-26 04:45:05,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 42: [2022-11-26 04:45:05,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 42: [2022-11-26 04:45:05,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-26 04:45:05,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 20: [2022-11-26 04:45:05,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 20: [2022-11-26 04:45:05,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-26 04:45:05,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 10: [2022-11-26 04:45:05,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-26 04:45:05,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-26 04:45:05,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 25: [2022-11-26 04:45:05,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 22: [2022-11-26 04:45:05,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 25: [2022-11-26 04:45:05,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-26 04:45:05,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 22: [2022-11-26 04:45:05,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-26 04:45:05,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 7: [2022-11-26 04:45:05,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 26: [2022-11-26 04:45:05,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 7: [2022-11-26 04:45:05,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-26 04:45:05,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 26: [2022-11-26 04:45:05,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-26 04:45:05,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 4: [2022-11-26 04:45:05,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 4: [2022-11-26 04:45:05,100] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 0: [2022-11-26 04:45:05,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 4: [2022-11-26 04:45:05,100] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 0: [2022-11-26 04:45:05,100] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 0: [2022-11-26 04:45:05,100] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 53: [2022-11-26 04:45:05,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-26 04:45:05,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-26 04:45:05,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 40: [2022-11-26 04:45:05,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-26 04:45:05,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-26 04:45:05,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 54: [2022-11-26 04:45:05,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 54: [2022-11-26 04:45:05,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-26 04:45:05,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 9: [2022-11-26 04:45:05,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 9: [2022-11-26 04:45:05,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-26 04:45:05,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 61: [2022-11-26 04:45:05,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-26 04:45:05,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 11: [2022-11-26 04:45:05,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-26 04:45:05,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 61: [2022-11-26 04:45:05,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 11: [2022-11-26 04:45:05,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 6: [2022-11-26 04:45:05,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-26 04:45:05,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-26 04:45:05,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 60: [2022-11-26 04:45:05,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-26 04:45:05,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-26 04:45:05,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 58: [2022-11-26 04:45:05,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-26 04:45:05,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-26 04:45:05,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 18: [2022-11-26 04:45:05,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-26 04:45:05,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-26 04:45:05,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 12: [2022-11-26 04:45:05,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-26 04:45:05,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-26 04:45:05,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 27: [2022-11-26 04:45:05,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 27: [2022-11-26 04:45:05,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-26 04:45:05,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 59: [2022-11-26 04:45:05,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 21: [2022-11-26 04:45:05,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 59: [2022-11-26 04:45:05,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-26 04:45:05,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 21: [2022-11-26 04:45:05,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 21: [2022-11-26 04:45:05,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 1: [2022-11-26 04:45:05,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-26 04:45:05,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-26 04:45:05,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 46: [2022-11-26 04:45:05,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:45:05,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-26 04:45:05,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 3: [2022-11-26 04:45:05,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 3: [2022-11-26 04:45:05,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-26 04:45:05,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 43: [2022-11-26 04:45:05,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-26 04:45:05,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-26 04:45:05,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 56: [2022-11-26 04:45:05,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-26 04:45:05,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-26 04:45:05,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 44: [2022-11-26 04:45:05,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-26 04:45:05,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-26 04:45:05,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 39: [2022-11-26 04:45:05,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 52: [2022-11-26 04:45:05,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 39: [2022-11-26 04:45:05,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 52: [2022-11-26 04:45:05,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 39: [2022-11-26 04:45:05,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 52: [2022-11-26 04:45:05,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 48: [2022-11-26 04:45:05,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-26 04:45:05,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-26 04:45:05,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 62: [2022-11-26 04:45:05,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-26 04:45:05,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-26 04:45:05,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 31: [2022-11-26 04:45:05,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 63: [2022-11-26 04:45:05,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-26 04:45:05,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 31: [2022-11-26 04:45:05,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 63: [2022-11-26 04:45:05,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 31: [2022-11-26 04:45:05,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 17: [2022-11-26 04:45:05,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-26 04:45:05,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-26 04:45:05,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 5: [2022-11-26 04:45:05,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-26 04:45:05,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 5: [2022-11-26 04:45:05,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 28: [2022-11-26 04:45:05,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-26 04:45:05,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-26 04:45:05,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 30: [2022-11-26 04:45:05,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-26 04:45:05,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-26 04:45:05,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 35: [2022-11-26 04:45:05,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-26 04:45:05,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 13: [2022-11-26 04:45:05,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 35: [2022-11-26 04:45:05,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 13: [2022-11-26 04:45:05,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-26 04:45:05,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 50: [2022-11-26 04:45:05,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 50: [2022-11-26 04:45:05,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-26 04:45:05,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 41: [2022-11-26 04:45:05,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-26 04:45:05,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-26 04:45:05,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 45: [2022-11-26 04:45:05,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-26 04:45:05,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-26 04:45:05,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 19: [2022-11-26 04:45:05,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-26 04:45:05,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-26 04:45:05,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 34: [2022-11-26 04:45:05,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 34: [2022-11-26 04:45:05,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-26 04:45:05,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 23: [2022-11-26 04:45:05,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 23: [2022-11-26 04:45:05,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-26 04:45:05,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 32: [2022-11-26 04:45:05,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-26 04:45:05,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-26 04:45:05,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 49: [2022-11-26 04:45:05,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-26 04:45:05,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-26 04:45:05,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 33: [2022-11-26 04:45:05,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-26 04:45:05,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-26 04:45:05,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 55: [2022-11-26 04:45:05,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 55: [2022-11-26 04:45:05,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-26 04:45:05,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 41: [2022-11-26 04:45:05,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-26 04:45:05,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-26 04:45:05,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 29: [2022-11-26 04:45:05,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-26 04:45:05,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-26 04:45:05,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 21: [2022-11-26 04:45:05,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-26 04:45:05,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-26 04:45:05,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 22: [2022-11-26 04:45:05,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 31: [2022-11-26 04:45:05,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 22: [2022-11-26 04:45:05,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 31: [2022-11-26 04:45:05,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 22: [2022-11-26 04:45:05,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 31: [2022-11-26 04:45:05,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 0: [2022-11-26 04:45:05,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-26 04:45:05,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-26 04:45:05,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 18: [2022-11-26 04:45:05,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-26 04:45:05,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-26 04:45:05,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 55: [2022-11-26 04:45:05,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-26 04:45:05,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-26 04:45:05,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 54: [2022-11-26 04:45:05,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 2: [2022-11-26 04:45:05,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 44: [2022-11-26 04:45:05,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 12: [2022-11-26 04:45:05,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 2: [2022-11-26 04:45:05,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 44: [2022-11-26 04:45:05,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-26 04:45:05,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 54: [2022-11-26 04:45:05,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 2: [2022-11-26 04:45:05,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 12: [2022-11-26 04:45:05,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 20: [2022-11-26 04:45:05,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 54: [2022-11-26 04:45:05,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 12: [2022-11-26 04:45:05,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 20: [2022-11-26 04:45:05,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-26 04:45:05,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 24: [2022-11-26 04:45:05,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-26 04:45:05,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 1: [2022-11-26 04:45:05,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 24: [2022-11-26 04:45:05,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 1: [2022-11-26 04:45:05,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-26 04:45:05,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 58: [2022-11-26 04:45:05,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-26 04:45:05,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-26 04:45:05,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 56: [2022-11-26 04:45:05,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 52: [2022-11-26 04:45:05,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 50: [2022-11-26 04:45:05,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 59: [2022-11-26 04:45:05,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 52: [2022-11-26 04:45:05,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 50: [2022-11-26 04:45:05,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 52: [2022-11-26 04:45:05,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 56: [2022-11-26 04:45:05,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 50: [2022-11-26 04:45:05,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 59: [2022-11-26 04:45:05,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 56: [2022-11-26 04:45:05,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 59: [2022-11-26 04:45:05,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 51: [2022-11-26 04:45:05,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:45:05,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-26 04:45:05,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 47: [2022-11-26 04:45:05,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-26 04:45:05,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-26 04:45:05,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 8: [2022-11-26 04:45:05,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 6: [2022-11-26 04:45:05,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 8: [2022-11-26 04:45:05,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 25: [2022-11-26 04:45:05,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 6: [2022-11-26 04:45:05,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 8: [2022-11-26 04:45:05,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 6: [2022-11-26 04:45:05,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 25: [2022-11-26 04:45:05,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-26 04:45:05,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 57: [2022-11-26 04:45:05,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 28: [2022-11-26 04:45:05,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 28: [2022-11-26 04:45:05,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-26 04:45:05,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 57: [2022-11-26 04:45:05,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 37: [2022-11-26 04:45:05,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 57: [2022-11-26 04:45:05,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 37: [2022-11-26 04:45:05,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-26 04:45:05,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 42: [2022-11-26 04:45:05,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 36: [2022-11-26 04:45:05,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 42: [2022-11-26 04:45:05,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 36: [2022-11-26 04:45:05,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 42: [2022-11-26 04:45:05,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 36: [2022-11-26 04:45:05,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 32: [2022-11-26 04:45:05,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 19: [2022-11-26 04:45:05,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 32: [2022-11-26 04:45:05,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 19: [2022-11-26 04:45:05,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-26 04:45:05,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 32: [2022-11-26 04:45:05,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 33: [2022-11-26 04:45:05,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 53: [2022-11-26 04:45:05,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 33: [2022-11-26 04:45:05,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-26 04:45:05,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 53: [2022-11-26 04:45:05,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-26 04:45:05,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 9: [2022-11-26 04:45:05,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 9: [2022-11-26 04:45:05,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-26 04:45:05,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 46: [2022-11-26 04:45:05,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-26 04:45:05,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-26 04:45:05,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 49: [2022-11-26 04:45:05,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-26 04:45:05,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 38: [2022-11-26 04:45:05,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 49: [2022-11-26 04:45:05,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 38: [2022-11-26 04:45:05,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 30: [2022-11-26 04:45:05,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 38: [2022-11-26 04:45:05,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 61: [2022-11-26 04:45:05,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-26 04:45:05,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 4: [2022-11-26 04:45:05,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 13: [2022-11-26 04:45:05,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 61: [2022-11-26 04:45:05,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 3: [2022-11-26 04:45:05,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 30: [2022-11-26 04:45:05,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 4: [2022-11-26 04:45:05,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 17: [2022-11-26 04:45:05,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 13: [2022-11-26 04:45:05,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 30: [2022-11-26 04:45:05,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 4: [2022-11-26 04:45:05,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 13: [2022-11-26 04:45:05,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 17: [2022-11-26 04:45:05,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 3: [2022-11-26 04:45:05,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 17: [2022-11-26 04:45:05,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 3: [2022-11-26 04:45:05,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 16: [2022-11-26 04:45:05,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-26 04:45:05,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-26 04:45:05,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 23: [2022-11-26 04:45:05,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 35: [2022-11-26 04:45:05,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 57: [2022-11-26 04:45:05,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 23: [2022-11-26 04:45:05,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 35: [2022-11-26 04:45:05,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 23: [2022-11-26 04:45:05,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 35: [2022-11-26 04:45:05,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 57: [2022-11-26 04:45:05,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-26 04:45:05,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 7: [2022-11-26 04:45:05,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-26 04:45:05,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 7: [2022-11-26 04:45:05,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 10: [2022-11-26 04:45:05,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 40: [2022-11-26 04:45:05,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 10: [2022-11-26 04:45:05,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-26 04:45:05,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 40: [2022-11-26 04:45:05,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 43: [2022-11-26 04:45:05,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 40: [2022-11-26 04:45:05,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 43: [2022-11-26 04:45:05,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-26 04:45:05,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 51: [2022-11-26 04:45:05,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-26 04:45:05,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 15: [2022-11-26 04:45:05,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 27: [2022-11-26 04:45:05,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 51: [2022-11-26 04:45:05,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 15: [2022-11-26 04:45:05,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 27: [2022-11-26 04:45:05,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 37: [2022-11-26 04:45:05,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 15: [2022-11-26 04:45:05,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 27: [2022-11-26 04:45:05,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 37: [2022-11-26 04:45:05,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 5: [2022-11-26 04:45:05,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 37: [2022-11-26 04:45:05,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 5: [2022-11-26 04:45:05,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 45: [2022-11-26 04:45:05,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 5: [2022-11-26 04:45:05,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 45: [2022-11-26 04:45:05,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-26 04:45:05,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 48: [2022-11-26 04:45:05,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-26 04:45:05,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-26 04:45:05,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 22: [2022-11-26 04:45:05,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 20: [2022-11-26 04:45:05,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 11: [2022-11-26 04:45:05,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 22: [2022-11-26 04:45:05,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 11: [2022-11-26 04:45:05,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 22: [2022-11-26 04:45:05,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 20: [2022-11-26 04:45:05,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 11: [2022-11-26 04:45:05,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 20: [2022-11-26 04:45:05,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 62: [2022-11-26 04:45:05,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 39: [2022-11-26 04:45:05,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 62: [2022-11-26 04:45:05,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 60: [2022-11-26 04:45:05,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 39: [2022-11-26 04:45:05,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 62: [2022-11-26 04:45:05,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 39: [2022-11-26 04:45:05,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 60: [2022-11-26 04:45:05,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-26 04:45:05,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 63: [2022-11-26 04:45:05,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-26 04:45:05,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-26 04:45:05,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 8: [2022-11-26 04:45:05,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 8: [2022-11-26 04:45:05,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-26 04:45:05,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 26: [2022-11-26 04:45:05,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-26 04:45:05,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-26 04:45:05,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 42: [2022-11-26 04:45:05,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 42: [2022-11-26 04:45:05,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 42: [2022-11-26 04:45:05,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 62: [2022-11-26 04:45:05,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-26 04:45:05,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 62: [2022-11-26 04:45:05,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 47: [2022-11-26 04:45:05,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-26 04:45:05,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-26 04:45:05,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 14: [2022-11-26 04:45:05,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 14: [2022-11-26 04:45:05,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-26 04:45:05,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 14: [2022-11-26 04:45:05,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-26 04:45:05,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-26 04:45:05,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 14: [2022-11-26 04:45:05,259] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-26 04:45:05,259] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-26 04:45:05,259] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 14: [2022-11-26 04:45:05,259] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 14: [2022-11-26 04:45:05,260] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-26 04:45:05,260] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 14: [2022-11-26 04:45:05,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 14: [2022-11-26 04:45:05,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-26 04:45:05,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-26 04:45:05,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-26 04:45:05,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-26 04:45:05,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-26 04:45:05,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 14: [2022-11-26 04:45:05,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 14: [2022-11-26 04:45:05,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 14: [2022-11-26 04:45:05,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-26 04:45:05,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step17000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-26 04:45:05,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! 0: successfully saved checkpoint at iteration 17000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 6247.46 63: iteration 17010/ 24424 | consumed samples: 8709120 | consumed tokens: 17836277760 | elapsed time per iteration (s): 3.47 | learning rate: 5.863E-05 | global batch size: 512 | lm loss: 2.038729E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 147.376 | TFLOPs: 15.17 | 63: iteration 17020/ 24424 | consumed samples: 8714240 | consumed tokens: 17846763520 | elapsed time per iteration (s): 2.23 | learning rate: 5.853E-05 | global batch size: 512 | lm loss: 2.030290E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.858 | TFLOPs: 23.66 | 63: iteration 17030/ 24424 | consumed samples: 8719360 | consumed tokens: 17857249280 | elapsed time per iteration (s): 2.24 | learning rate: 5.844E-05 | global batch size: 512 | lm loss: 2.011209E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.568 | TFLOPs: 23.53 | 63: iteration 17040/ 24424 | consumed samples: 8724480 | consumed tokens: 17867735040 | elapsed time per iteration (s): 2.23 | learning rate: 5.834E-05 | global batch size: 512 | lm loss: 2.025099E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.506 | TFLOPs: 23.63 | 63: iteration 17050/ 24424 | consumed samples: 8729600 | consumed tokens: 17878220800 | elapsed time per iteration (s): 2.25 | learning rate: 5.825E-05 | global batch size: 512 | lm loss: 2.031804E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.384 | TFLOPs: 23.41 | 63: iteration 17060/ 24424 | consumed samples: 8734720 | consumed tokens: 17888706560 | elapsed time per iteration (s): 2.29 | learning rate: 5.815E-05 | global batch size: 512 | lm loss: 2.018603E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.048 | TFLOPs: 23.06 | 63: iteration 17070/ 24424 | consumed samples: 8739840 | consumed tokens: 17899192320 | elapsed time per iteration (s): 2.23 | learning rate: 5.806E-05 | global batch size: 512 | lm loss: 2.022087E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.778 | TFLOPs: 23.65 | 63: iteration 17080/ 24424 | consumed samples: 8744960 | consumed tokens: 17909678080 | elapsed time per iteration (s): 2.23 | learning rate: 5.796E-05 | global batch size: 512 | lm loss: 2.017959E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.333 | TFLOPs: 23.61 | 63: iteration 17090/ 24424 | consumed samples: 8750080 | consumed tokens: 17920163840 | elapsed time per iteration (s): 2.23 | learning rate: 5.786E-05 | global batch size: 512 | lm loss: 2.040243E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.243 | TFLOPs: 23.60 | 63: iteration 17100/ 24424 | consumed samples: 8755200 | consumed tokens: 17930649600 | elapsed time per iteration (s): 2.25 | learning rate: 5.777E-05 | global batch size: 512 | lm loss: 2.017276E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.915 | TFLOPs: 23.46 | 63: iteration 17110/ 24424 | consumed samples: 8760320 | consumed tokens: 17941135360 | elapsed time per iteration (s): 2.23 | learning rate: 5.767E-05 | global batch size: 512 | lm loss: 2.022099E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.500 | TFLOPs: 23.63 | 63: iteration 17120/ 24424 | consumed samples: 8765440 | consumed tokens: 17951621120 | elapsed time per iteration (s): 2.23 | learning rate: 5.758E-05 | global batch size: 512 | lm loss: 2.003775E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.351 | TFLOPs: 23.61 | 63: iteration 17130/ 24424 | consumed samples: 8770560 | consumed tokens: 17962106880 | elapsed time per iteration (s): 2.24 | learning rate: 5.748E-05 | global batch size: 512 | lm loss: 2.021328E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.187 | TFLOPs: 23.49 | 63: iteration 17140/ 24424 | consumed samples: 8775680 | consumed tokens: 17972592640 | elapsed time per iteration (s): 2.32 | learning rate: 5.739E-05 | global batch size: 512 | lm loss: 2.037383E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.816 | TFLOPs: 22.73 | 63: iteration 17150/ 24424 | consumed samples: 8780800 | consumed tokens: 17983078400 | elapsed time per iteration (s): 2.24 | learning rate: 5.729E-05 | global batch size: 512 | lm loss: 2.045922E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.812 | TFLOPs: 23.56 | 63: iteration 17160/ 24424 | consumed samples: 8785920 | consumed tokens: 17993564160 | elapsed time per iteration (s): 2.24 | learning rate: 5.720E-05 | global batch size: 512 | lm loss: 2.044536E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.050 | TFLOPs: 23.58 | 63: iteration 17170/ 24424 | consumed samples: 8791040 | consumed tokens: 18004049920 | elapsed time per iteration (s): 2.77 | learning rate: 5.711E-05 | global batch size: 512 | lm loss: 2.023685E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.116 | TFLOPs: 19.06 | 63: iteration 17180/ 24424 | consumed samples: 8796160 | consumed tokens: 18014535680 | elapsed time per iteration (s): 2.24 | learning rate: 5.701E-05 | global batch size: 512 | lm loss: 2.001332E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.193 | TFLOPs: 23.49 | 63: iteration 17190/ 24424 | consumed samples: 8801280 | consumed tokens: 18025021440 | elapsed time per iteration (s): 2.25 | learning rate: 5.692E-05 | global batch size: 512 | lm loss: 2.043009E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.291 | TFLOPs: 23.40 | 63: iteration 17200/ 24424 | consumed samples: 8806400 | consumed tokens: 18035507200 | elapsed time per iteration (s): 2.25 | learning rate: 5.682E-05 | global batch size: 512 | lm loss: 2.023598E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.060 | TFLOPs: 23.37 | 63: iteration 17210/ 24424 | consumed samples: 8811520 | consumed tokens: 18045992960 | elapsed time per iteration (s): 2.23 | learning rate: 5.673E-05 | global batch size: 512 | lm loss: 2.034457E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.030 | TFLOPs: 23.68 | 63: iteration 17220/ 24424 | consumed samples: 8816640 | consumed tokens: 18056478720 | elapsed time per iteration (s): 2.23 | learning rate: 5.663E-05 | global batch size: 512 | lm loss: 2.035633E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.217 | TFLOPs: 23.60 | 63: iteration 17230/ 24424 | consumed samples: 8821760 | consumed tokens: 18066964480 | elapsed time per iteration (s): 2.26 | learning rate: 5.654E-05 | global batch size: 512 | lm loss: 2.016836E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.296 | TFLOPs: 23.30 | 63: iteration 17240/ 24424 | consumed samples: 8826880 | consumed tokens: 18077450240 | elapsed time per iteration (s): 2.24 | learning rate: 5.645E-05 | global batch size: 512 | lm loss: 1.998746E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.353 | TFLOPs: 23.51 | 63: iteration 17250/ 24424 | consumed samples: 8832000 | consumed tokens: 18087936000 | elapsed time per iteration (s): 2.24 | learning rate: 5.635E-05 | global batch size: 512 | lm loss: 2.001849E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.527 | TFLOPs: 23.53 | 63: iteration 17260/ 24424 | consumed samples: 8837120 | consumed tokens: 18098421760 | elapsed time per iteration (s): 2.23 | learning rate: 5.626E-05 | global batch size: 512 | lm loss: 2.002247E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.198 | TFLOPs: 23.59 | 63: iteration 17270/ 24424 | consumed samples: 8842240 | consumed tokens: 18108907520 | elapsed time per iteration (s): 2.25 | learning rate: 5.616E-05 | global batch size: 512 | lm loss: 2.016018E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.700 | TFLOPs: 23.44 | 63: iteration 17280/ 24424 | consumed samples: 8847360 | consumed tokens: 18119393280 | elapsed time per iteration (s): 2.25 | learning rate: 5.607E-05 | global batch size: 512 | lm loss: 2.018907E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.640 | TFLOPs: 23.43 | 63: iteration 17290/ 24424 | consumed samples: 8852480 | consumed tokens: 18129879040 | elapsed time per iteration (s): 2.24 | learning rate: 5.598E-05 | global batch size: 512 | lm loss: 2.030545E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.056 | TFLOPs: 23.58 | 63: iteration 17300/ 24424 | consumed samples: 8857600 | consumed tokens: 18140364800 | elapsed time per iteration (s): 2.35 | learning rate: 5.588E-05 | global batch size: 512 | lm loss: 2.002807E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 217.490 | TFLOPs: 22.39 | 63: iteration 17310/ 24424 | consumed samples: 8862720 | consumed tokens: 18150850560 | elapsed time per iteration (s): 2.25 | learning rate: 5.579E-05 | global batch size: 512 | lm loss: 1.989781E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.386 | TFLOPs: 23.41 | 63: iteration 17320/ 24424 | consumed samples: 8867840 | consumed tokens: 18161336320 | elapsed time per iteration (s): 2.28 | learning rate: 5.570E-05 | global batch size: 512 | lm loss: 2.019480E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.819 | TFLOPs: 23.14 | 63: iteration 17330/ 24424 | consumed samples: 8872960 | consumed tokens: 18171822080 | elapsed time per iteration (s): 2.29 | learning rate: 5.560E-05 | global batch size: 512 | lm loss: 2.023476E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.984 | TFLOPs: 23.06 | 63: iteration 17340/ 24424 | consumed samples: 8878080 | consumed tokens: 18182307840 | elapsed time per iteration (s): 2.25 | learning rate: 5.551E-05 | global batch size: 512 | lm loss: 2.025002E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.925 | TFLOPs: 23.46 | 63: iteration 17350/ 24424 | consumed samples: 8883200 | consumed tokens: 18192793600 | elapsed time per iteration (s): 2.24 | learning rate: 5.542E-05 | global batch size: 512 | lm loss: 2.015142E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.718 | TFLOPs: 23.55 | 63: iteration 17360/ 24424 | consumed samples: 8888320 | consumed tokens: 18203279360 | elapsed time per iteration (s): 2.25 | learning rate: 5.532E-05 | global batch size: 512 | lm loss: 2.024497E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.335 | TFLOPs: 23.40 | 63: iteration 17370/ 24424 | consumed samples: 8893440 | consumed tokens: 18213765120 | elapsed time per iteration (s): 2.23 | learning rate: 5.523E-05 | global batch size: 512 | lm loss: 2.031250E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.566 | TFLOPs: 23.63 | 63: iteration 17380/ 24424 | consumed samples: 8898560 | consumed tokens: 18224250880 | elapsed time per iteration (s): 2.33 | learning rate: 5.514E-05 | global batch size: 512 | lm loss: 2.019171E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 219.725 | TFLOPs: 22.62 | 63: iteration 17390/ 24424 | consumed samples: 8903680 | consumed tokens: 18234736640 | elapsed time per iteration (s): 2.27 | learning rate: 5.505E-05 | global batch size: 512 | lm loss: 1.995476E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.066 | TFLOPs: 23.17 | 63: iteration 17400/ 24424 | consumed samples: 8908800 | consumed tokens: 18245222400 | elapsed time per iteration (s): 2.27 | learning rate: 5.495E-05 | global batch size: 512 | lm loss: 2.011627E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.908 | TFLOPs: 23.26 | 63: iteration 17410/ 24424 | consumed samples: 8913920 | consumed tokens: 18255708160 | elapsed time per iteration (s): 2.28 | learning rate: 5.486E-05 | global batch size: 512 | lm loss: 2.010797E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.918 | TFLOPs: 23.15 | 63: iteration 17420/ 24424 | consumed samples: 8919040 | consumed tokens: 18266193920 | elapsed time per iteration (s): 2.23 | learning rate: 5.477E-05 | global batch size: 512 | lm loss: 2.012780E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.923 | TFLOPs: 23.67 | 63: iteration 17430/ 24424 | consumed samples: 8924160 | consumed tokens: 18276679680 | elapsed time per iteration (s): 2.24 | learning rate: 5.468E-05 | global batch size: 512 | lm loss: 2.025672E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.401 | TFLOPs: 23.51 | 63: iteration 17440/ 24424 | consumed samples: 8929280 | consumed tokens: 18287165440 | elapsed time per iteration (s): 2.23 | learning rate: 5.458E-05 | global batch size: 512 | lm loss: 2.028345E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.103 | TFLOPs: 23.69 | 63: iteration 17450/ 24424 | consumed samples: 8934400 | consumed tokens: 18297651200 | elapsed time per iteration (s): 2.25 | learning rate: 5.449E-05 | global batch size: 512 | lm loss: 2.034826E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.736 | TFLOPs: 23.44 | 63: iteration 17460/ 24424 | consumed samples: 8939520 | consumed tokens: 18308136960 | elapsed time per iteration (s): 2.35 | learning rate: 5.440E-05 | global batch size: 512 | lm loss: 2.018681E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 217.553 | TFLOPs: 22.40 | 63: iteration 17470/ 24424 | consumed samples: 8944640 | consumed tokens: 18318622720 | elapsed time per iteration (s): 2.23 | learning rate: 5.431E-05 | global batch size: 512 | lm loss: 2.018729E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.917 | TFLOPs: 23.67 | 63: iteration 17480/ 24424 | consumed samples: 8949760 | consumed tokens: 18329108480 | elapsed time per iteration (s): 2.25 | learning rate: 5.422E-05 | global batch size: 512 | lm loss: 2.015505E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.434 | TFLOPs: 23.41 | 63: iteration 17490/ 24424 | consumed samples: 8954880 | consumed tokens: 18339594240 | elapsed time per iteration (s): 2.26 | learning rate: 5.412E-05 | global batch size: 512 | lm loss: 2.034981E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.828 | TFLOPs: 23.35 | 63: iteration 17500/ 24424 | consumed samples: 8960000 | consumed tokens: 18350080000 | elapsed time per iteration (s): 2.23 | learning rate: 5.403E-05 | global batch size: 512 | lm loss: 2.049366E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.340 | TFLOPs: 23.61 | 63: iteration 17510/ 24424 | consumed samples: 8965120 | consumed tokens: 18360565760 | elapsed time per iteration (s): 2.23 | learning rate: 5.394E-05 | global batch size: 512 | lm loss: 2.036353E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.454 | TFLOPs: 23.62 | 63: iteration 17520/ 24424 | consumed samples: 8970240 | consumed tokens: 18371051520 | elapsed time per iteration (s): 2.23 | learning rate: 5.385E-05 | global batch size: 512 | lm loss: 2.024450E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.086 | TFLOPs: 23.58 | 63: iteration 17530/ 24424 | consumed samples: 8975360 | consumed tokens: 18381537280 | elapsed time per iteration (s): 2.23 | learning rate: 5.376E-05 | global batch size: 512 | lm loss: 2.027443E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.233 | TFLOPs: 23.60 | 63: iteration 17540/ 24424 | consumed samples: 8980480 | consumed tokens: 18392023040 | elapsed time per iteration (s): 2.28 | learning rate: 5.367E-05 | global batch size: 512 | lm loss: 2.013190E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.372 | TFLOPs: 23.10 | 63: iteration 17550/ 24424 | consumed samples: 8985600 | consumed tokens: 18402508800 | elapsed time per iteration (s): 2.26 | learning rate: 5.358E-05 | global batch size: 512 | lm loss: 2.002406E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.221 | TFLOPs: 23.29 | 63: iteration 17560/ 24424 | consumed samples: 8990720 | consumed tokens: 18412994560 | elapsed time per iteration (s): 2.25 | learning rate: 5.349E-05 | global batch size: 512 | lm loss: 2.007113E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.629 | TFLOPs: 23.43 | 63: iteration 17570/ 24424 | consumed samples: 8995840 | consumed tokens: 18423480320 | elapsed time per iteration (s): 2.24 | learning rate: 5.339E-05 | global batch size: 512 | lm loss: 2.025374E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.579 | TFLOPs: 23.53 | 63: iteration 17580/ 24424 | consumed samples: 9000960 | consumed tokens: 18433966080 | elapsed time per iteration (s): 2.25 | learning rate: 5.330E-05 | global batch size: 512 | lm loss: 2.015930E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.354 | TFLOPs: 23.40 | 63: iteration 17590/ 24424 | consumed samples: 9006080 | consumed tokens: 18444451840 | elapsed time per iteration (s): 2.24 | learning rate: 5.321E-05 | global batch size: 512 | lm loss: 2.005250E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.728 | TFLOPs: 23.55 | 63: iteration 17600/ 24424 | consumed samples: 9011200 | consumed tokens: 18454937600 | elapsed time per iteration (s): 2.23 | learning rate: 5.312E-05 | global batch size: 512 | lm loss: 2.029000E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.502 | TFLOPs: 23.63 | 63: iteration 17610/ 24424 | consumed samples: 9016320 | consumed tokens: 18465423360 | elapsed time per iteration (s): 2.23 | learning rate: 5.303E-05 | global batch size: 512 | lm loss: 2.021030E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.979 | TFLOPs: 23.68 | 63: iteration 17620/ 24424 | consumed samples: 9021440 | consumed tokens: 18475909120 | elapsed time per iteration (s): 2.34 | learning rate: 5.294E-05 | global batch size: 512 | lm loss: 2.013115E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 218.812 | TFLOPs: 22.53 | 63: iteration 17630/ 24424 | consumed samples: 9026560 | consumed tokens: 18486394880 | elapsed time per iteration (s): 2.24 | learning rate: 5.285E-05 | global batch size: 512 | lm loss: 2.008212E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.398 | TFLOPs: 23.51 | 63: iteration 17640/ 24424 | consumed samples: 9031680 | consumed tokens: 18496880640 | elapsed time per iteration (s): 2.25 | learning rate: 5.276E-05 | global batch size: 512 | lm loss: 2.020408E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.309 | TFLOPs: 23.40 | 63: iteration 17650/ 24424 | consumed samples: 9036800 | consumed tokens: 18507366400 | elapsed time per iteration (s): 2.28 | learning rate: 5.267E-05 | global batch size: 512 | lm loss: 2.011683E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.618 | TFLOPs: 23.12 | 63: iteration 17660/ 24424 | consumed samples: 9041920 | consumed tokens: 18517852160 | elapsed time per iteration (s): 2.23 | learning rate: 5.258E-05 | global batch size: 512 | lm loss: 2.022387E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.290 | TFLOPs: 23.60 | 63: iteration 17670/ 24424 | consumed samples: 9047040 | consumed tokens: 18528337920 | elapsed time per iteration (s): 2.23 | learning rate: 5.249E-05 | global batch size: 512 | lm loss: 2.011607E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.689 | TFLOPs: 23.65 | 63: iteration 17680/ 24424 | consumed samples: 9052160 | consumed tokens: 18538823680 | elapsed time per iteration (s): 2.25 | learning rate: 5.240E-05 | global batch size: 512 | lm loss: 2.008039E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.916 | TFLOPs: 23.46 | 63: iteration 17690/ 24424 | consumed samples: 9057280 | consumed tokens: 18549309440 | elapsed time per iteration (s): 2.24 | learning rate: 5.231E-05 | global batch size: 512 | lm loss: 2.012469E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.172 | TFLOPs: 23.49 | 63: iteration 17700/ 24424 | consumed samples: 9062400 | consumed tokens: 18559795200 | elapsed time per iteration (s): 2.24 | learning rate: 5.222E-05 | global batch size: 512 | lm loss: 2.033855E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.941 | TFLOPs: 23.57 | 63: iteration 17710/ 24424 | consumed samples: 9067520 | consumed tokens: 18570280960 | elapsed time per iteration (s): 2.30 | learning rate: 5.213E-05 | global batch size: 512 | lm loss: 2.011110E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.008 | TFLOPs: 22.96 | 63: iteration 17720/ 24424 | consumed samples: 9072640 | consumed tokens: 18580766720 | elapsed time per iteration (s): 2.29 | learning rate: 5.204E-05 | global batch size: 512 | lm loss: 1.996302E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.430 | TFLOPs: 23.00 | 63: iteration 17730/ 24424 | consumed samples: 9077760 | consumed tokens: 18591252480 | elapsed time per iteration (s): 2.29 | learning rate: 5.195E-05 | global batch size: 512 | lm loss: 1.994740E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.995 | TFLOPs: 23.06 | 63: iteration 17740/ 24424 | consumed samples: 9082880 | consumed tokens: 18601738240 | elapsed time per iteration (s): 2.24 | learning rate: 5.186E-05 | global batch size: 512 | lm loss: 2.025628E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.240 | TFLOPs: 23.50 | 63: iteration 17750/ 24424 | consumed samples: 9088000 | consumed tokens: 18612224000 | elapsed time per iteration (s): 2.23 | learning rate: 5.177E-05 | global batch size: 512 | lm loss: 2.028324E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.245 | TFLOPs: 23.60 | 63: iteration 17760/ 24424 | consumed samples: 9093120 | consumed tokens: 18622709760 | elapsed time per iteration (s): 2.23 | learning rate: 5.168E-05 | global batch size: 512 | lm loss: 2.016018E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.131 | TFLOPs: 23.59 | 63: iteration 17770/ 24424 | consumed samples: 9098240 | consumed tokens: 18633195520 | elapsed time per iteration (s): 2.24 | learning rate: 5.160E-05 | global batch size: 512 | lm loss: 2.005242E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.387 | TFLOPs: 23.51 | 63: iteration 17780/ 24424 | consumed samples: 9103360 | consumed tokens: 18643681280 | elapsed time per iteration (s): 2.32 | learning rate: 5.151E-05 | global batch size: 512 | lm loss: 2.029090E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.359 | TFLOPs: 22.68 | 63: iteration 17790/ 24424 | consumed samples: 9108480 | consumed tokens: 18654167040 | elapsed time per iteration (s): 2.25 | learning rate: 5.142E-05 | global batch size: 512 | lm loss: 2.024499E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.672 | TFLOPs: 23.44 | 63: iteration 17800/ 24424 | consumed samples: 9113600 | consumed tokens: 18664652800 | elapsed time per iteration (s): 3.87 | learning rate: 5.133E-05 | global batch size: 512 | lm loss: 1.998093E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 132.339 | TFLOPs: 13.62 | 63: iteration 17810/ 24424 | consumed samples: 9118720 | consumed tokens: 18675138560 | elapsed time per iteration (s): 2.24 | learning rate: 5.124E-05 | global batch size: 512 | lm loss: 2.014368E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.479 | TFLOPs: 23.52 | 63: iteration 17820/ 24424 | consumed samples: 9123840 | consumed tokens: 18685624320 | elapsed time per iteration (s): 2.26 | learning rate: 5.115E-05 | global batch size: 512 | lm loss: 2.015385E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.376 | TFLOPs: 23.30 | 63: iteration 17830/ 24424 | consumed samples: 9128960 | consumed tokens: 18696110080 | elapsed time per iteration (s): 2.23 | learning rate: 5.106E-05 | global batch size: 512 | lm loss: 2.016738E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.985 | TFLOPs: 23.68 | 63: iteration 17840/ 24424 | consumed samples: 9134080 | consumed tokens: 18706595840 | elapsed time per iteration (s): 2.26 | learning rate: 5.097E-05 | global batch size: 512 | lm loss: 2.019877E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.111 | TFLOPs: 23.28 | 63: iteration 17850/ 24424 | consumed samples: 9139200 | consumed tokens: 18717081600 | elapsed time per iteration (s): 2.25 | learning rate: 5.089E-05 | global batch size: 512 | lm loss: 1.992898E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.614 | TFLOPs: 23.43 | 63: iteration 17860/ 24424 | consumed samples: 9144320 | consumed tokens: 18727567360 | elapsed time per iteration (s): 2.25 | learning rate: 5.080E-05 | global batch size: 512 | lm loss: 2.005732E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.034 | TFLOPs: 23.47 | 63: iteration 17870/ 24424 | consumed samples: 9149440 | consumed tokens: 18738053120 | elapsed time per iteration (s): 2.24 | learning rate: 5.071E-05 | global batch size: 512 | lm loss: 2.029835E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.208 | TFLOPs: 23.49 | 63: iteration 17880/ 24424 | consumed samples: 9154560 | consumed tokens: 18748538880 | elapsed time per iteration (s): 2.23 | learning rate: 5.062E-05 | global batch size: 512 | lm loss: 2.013773E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.126 | TFLOPs: 23.59 | 63: iteration 17890/ 24424 | consumed samples: 9159680 | consumed tokens: 18759024640 | elapsed time per iteration (s): 2.25 | learning rate: 5.053E-05 | global batch size: 512 | lm loss: 2.000055E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.315 | TFLOPs: 23.40 | 63: iteration 17900/ 24424 | consumed samples: 9164800 | consumed tokens: 18769510400 | elapsed time per iteration (s): 2.26 | learning rate: 5.045E-05 | global batch size: 512 | lm loss: 2.005660E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.679 | TFLOPs: 23.34 | 63: iteration 17910/ 24424 | consumed samples: 9169920 | consumed tokens: 18779996160 | elapsed time per iteration (s): 2.24 | learning rate: 5.036E-05 | global batch size: 512 | lm loss: 2.013263E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.481 | TFLOPs: 23.52 | 63: iteration 17920/ 24424 | consumed samples: 9175040 | consumed tokens: 18790481920 | elapsed time per iteration (s): 2.23 | learning rate: 5.027E-05 | global batch size: 512 | lm loss: 1.987043E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.663 | TFLOPs: 23.64 | 63: iteration 17930/ 24424 | consumed samples: 9180160 | consumed tokens: 18800967680 | elapsed time per iteration (s): 2.32 | learning rate: 5.018E-05 | global batch size: 512 | lm loss: 2.018223E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.226 | TFLOPs: 22.67 | 63: iteration 17940/ 24424 | consumed samples: 9185280 | consumed tokens: 18811453440 | elapsed time per iteration (s): 2.25 | learning rate: 5.010E-05 | global batch size: 512 | lm loss: 2.024501E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.037 | TFLOPs: 23.48 | 63: iteration 17950/ 24424 | consumed samples: 9190400 | consumed tokens: 18821939200 | elapsed time per iteration (s): 2.24 | learning rate: 5.001E-05 | global batch size: 512 | lm loss: 2.011208E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.911 | TFLOPs: 23.57 | 63: iteration 17960/ 24424 | consumed samples: 9195520 | consumed tokens: 18832424960 | elapsed time per iteration (s): 2.32 | learning rate: 4.992E-05 | global batch size: 512 | lm loss: 2.015370E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.137 | TFLOPs: 22.77 | 63: iteration 17970/ 24424 | consumed samples: 9200640 | consumed tokens: 18842910720 | elapsed time per iteration (s): 2.24 | learning rate: 4.984E-05 | global batch size: 512 | lm loss: 1.998117E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.781 | TFLOPs: 23.55 | 63: iteration 17980/ 24424 | consumed samples: 9205760 | consumed tokens: 18853396480 | elapsed time per iteration (s): 2.29 | learning rate: 4.975E-05 | global batch size: 512 | lm loss: 2.029686E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.574 | TFLOPs: 23.02 | 63: iteration 17990/ 24424 | consumed samples: 9210880 | consumed tokens: 18863882240 | elapsed time per iteration (s): 2.23 | learning rate: 4.966E-05 | global batch size: 512 | lm loss: 2.007504E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.709 | TFLOPs: 23.65 | 0: [2022-11-26 05:23:05,213] [INFO] [logging.py:68:log_dist] [Rank 0] step=18000, skipped=0, lr=[4.957547611641215e-05, 4.957547611641215e-05, 4.957547611641215e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] 63: iteration 18000/ 24424 | consumed samples: 9216000 | consumed tokens: 18874368000 | elapsed time per iteration (s): 2.23 | learning rate: 4.958E-05 | global batch size: 512 | lm loss: 2.013498E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.082 | TFLOPs: 23.69 | 0: steps: 18000 loss: 2.0313 iter time (s): 2.278 samples/sec: 224.733 63: ------------------------------------------------------------------------------------------- 63: valid loss at iteration 18000 | lm loss value: 2.012289E+00 | lm loss PPL: 7.480423E+00 | 63: ------------------------------------------------------------------------------------------- 0: saving checkpoint at iteration 18000 to checkpoints_3b9 0: [2022-11-26 05:23:05,929] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step18000 is begin to save! 0: [2022-11-26 05:23:05,949] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_01-model_00-model_states.pt... 32: [2022-11-26 05:23:05,949] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_21-model_00-model_states.pt... 32: [2022-11-26 05:23:06,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_21-model_00-model_states.pt. 32: [2022-11-26 05:23:06,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_22-model_00-model_states.pt... 0: [2022-11-26 05:23:06,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_01-model_00-model_states.pt. 0: [2022-11-26 05:23:06,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_03-model_00-model_states.pt... 32: [2022-11-26 05:23:06,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_22-model_00-model_states.pt. 32: [2022-11-26 05:23:06,426] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_23-model_00-model_states.pt... 0: [2022-11-26 05:23:06,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_03-model_00-model_states.pt. 0: [2022-11-26 05:23:06,584] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_04-model_00-model_states.pt... 32: [2022-11-26 05:23:06,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_23-model_00-model_states.pt. 32: [2022-11-26 05:23:06,659] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_24-model_00-model_states.pt... 0: [2022-11-26 05:23:06,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_04-model_00-model_states.pt. 0: [2022-11-26 05:23:06,830] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_05-model_00-model_states.pt... 32: [2022-11-26 05:23:06,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_24-model_00-model_states.pt. 32: [2022-11-26 05:23:06,903] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_25-model_00-model_states.pt... 0: [2022-11-26 05:23:07,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_05-model_00-model_states.pt. 0: [2022-11-26 05:23:07,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_06-model_00-model_states.pt... 32: [2022-11-26 05:23:07,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_25-model_00-model_states.pt. 32: [2022-11-26 05:23:07,134] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_26-model_00-model_states.pt... 0: [2022-11-26 05:23:07,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_06-model_00-model_states.pt. 0: [2022-11-26 05:23:07,309] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_07-model_00-model_states.pt... 32: [2022-11-26 05:23:07,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_26-model_00-model_states.pt. 32: [2022-11-26 05:23:07,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_27-model_00-model_states.pt... 0: [2022-11-26 05:23:07,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_07-model_00-model_states.pt. 0: [2022-11-26 05:23:07,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_08-model_00-model_states.pt... 32: [2022-11-26 05:23:07,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_27-model_00-model_states.pt. 32: [2022-11-26 05:23:07,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_28-model_00-model_states.pt... 0: [2022-11-26 05:23:07,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_08-model_00-model_states.pt. 0: [2022-11-26 05:23:07,777] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_09-model_00-model_states.pt... 32: [2022-11-26 05:23:07,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_28-model_00-model_states.pt. 32: [2022-11-26 05:23:07,828] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_29-model_00-model_states.pt... 0: [2022-11-26 05:23:08,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_09-model_00-model_states.pt. 0: [2022-11-26 05:23:08,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_10-model_00-model_states.pt... 32: [2022-11-26 05:23:08,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_29-model_00-model_states.pt. 32: [2022-11-26 05:23:08,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_30-model_00-model_states.pt... 0: [2022-11-26 05:23:08,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_10-model_00-model_states.pt. 0: [2022-11-26 05:23:08,230] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_11-model_00-model_states.pt... 32: [2022-11-26 05:23:08,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_30-model_00-model_states.pt. 32: [2022-11-26 05:23:08,282] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_31-model_00-model_states.pt... 0: [2022-11-26 05:23:08,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_11-model_00-model_states.pt. 0: [2022-11-26 05:23:08,451] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_12-model_00-model_states.pt... 32: [2022-11-26 05:23:08,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_31-model_00-model_states.pt. 32: [2022-11-26 05:23:08,513] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_32-model_00-model_states.pt... 0: [2022-11-26 05:23:08,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_12-model_00-model_states.pt. 0: [2022-11-26 05:23:08,676] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_13-model_00-model_states.pt... 32: [2022-11-26 05:23:08,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_32-model_00-model_states.pt. 32: [2022-11-26 05:23:08,741] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_33-model_00-model_states.pt... 0: [2022-11-26 05:23:08,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_13-model_00-model_states.pt. 0: [2022-11-26 05:23:08,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_14-model_00-model_states.pt... 32: [2022-11-26 05:23:08,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_33-model_00-model_states.pt. 32: [2022-11-26 05:23:08,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_34-model_00-model_states.pt... 0: [2022-11-26 05:23:09,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_14-model_00-model_states.pt. 0: [2022-11-26 05:23:09,119] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_15-model_00-model_states.pt... 32: [2022-11-26 05:23:09,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_34-model_00-model_states.pt. 32: [2022-11-26 05:23:09,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_35-model_00-model_states.pt... 0: [2022-11-26 05:23:09,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_15-model_00-model_states.pt. 0: [2022-11-26 05:23:09,339] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_16-model_00-model_states.pt... 32: [2022-11-26 05:23:09,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_35-model_00-model_states.pt. 32: [2022-11-26 05:23:09,432] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_36-model_00-model_states.pt... 0: [2022-11-26 05:23:09,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_16-model_00-model_states.pt. 0: [2022-11-26 05:23:09,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_17-model_00-model_states.pt... 32: [2022-11-26 05:23:09,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_36-model_00-model_states.pt. 32: [2022-11-26 05:23:09,658] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_37-model_00-model_states.pt... 0: [2022-11-26 05:23:09,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_17-model_00-model_states.pt. 0: [2022-11-26 05:23:09,781] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_18-model_00-model_states.pt... 32: [2022-11-26 05:23:09,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_37-model_00-model_states.pt. 32: [2022-11-26 05:23:09,883] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_38-model_00-model_states.pt... 0: [2022-11-26 05:23:09,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_18-model_00-model_states.pt. 0: [2022-11-26 05:23:09,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_19-model_00-model_states.pt... 32: [2022-11-26 05:23:10,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_38-model_00-model_states.pt. 32: [2022-11-26 05:23:10,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_40-model_00-model_states.pt... 32: [2022-11-26 05:23:10,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_40-model_00-model_states.pt. 32: [2022-11-26 05:23:10,114] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/mp_rank_01_model_states.pt... 32: [2022-11-26 05:23:10,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/mp_rank_01_model_states.pt. 0: [2022-11-26 05:23:10,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_19-model_00-model_states.pt. 0: [2022-11-26 05:23:10,212] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/layer_20-model_00-model_states.pt... 0: [2022-11-26 05:23:10,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/layer_20-model_00-model_states.pt. 0: [2022-11-26 05:23:10,431] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step18000/mp_rank_00_model_states.pt 0: [2022-11-26 05:23:10,431] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/mp_rank_00_model_states.pt... 0: [2022-11-26 05:23:10,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/mp_rank_00_model_states.pt. 51: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 55: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 55: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 53: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 53: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 53: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 57: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 57: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 57: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 61: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 61: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 59: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 59: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 63: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 52: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 54: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 54: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 54: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 56: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 58: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 58: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 58: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 62: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 62: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 60: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 60: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 60: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 33: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 33: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 33: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 35: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 35: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 35: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 34: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 34: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 40: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 40: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 44: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 44: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 44: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 44: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 44: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 32: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 42: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 42: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 42: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 46: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 46: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 46: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 46: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 46: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 36: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 36: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 48: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 48: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 41: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 37: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 37: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 47: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 47: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 49: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 49: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 43: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 43: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 43: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 45: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 45: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 39: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 39: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 39: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 50: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 50: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 50: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 38: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 38: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 51: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 55: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 53: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 53: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 57: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 61: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 61: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 61: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 59: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 63: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 19: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 19: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 19: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 21: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 21: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 21: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 21: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 21: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 21: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 15: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 15: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 15: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 15: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 52: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 52: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 54: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 56: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 56: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 58: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 62: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 16: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 60: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 2: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 2: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 4: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 4: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 4: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 4: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 4: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 14: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 14: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 18: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 18: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 10: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 10: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 8: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 8: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 0: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 0: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 26: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 26: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 26: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 11: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 11: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 11: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 17: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 17: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 17: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 17: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 9: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 9: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 9: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 13: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 23: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 23: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 23: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 25: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 25: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 27: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 27: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 27: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 27: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 27: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 29: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 29: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 29: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 29: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 29: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 31: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 31: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 31: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 5: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 5: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 33: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 33: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 1: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 1: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 1: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 7: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 3: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 35: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 35: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 22: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 12: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 12: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 12: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 30: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 30: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 24: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 24: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 28: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 34: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 40: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 40: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 40: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 44: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 6: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 6: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 6: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 32: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 42: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 42: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 46: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 36: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 36: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 48: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 41: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 37: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 47: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 49: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 43: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 43: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 45: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 39: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 50: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 38: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 51: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 57: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 63: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 19: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 19: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 19: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 21: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 15: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 15: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 54: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 58: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 62: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 16: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 16: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 60: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 2: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 4: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 14: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 14: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 14: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 14: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 14: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 10: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 8: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 8: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 8: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 0: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 26: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 26: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 26: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 11: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 11: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 11: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 11: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 17: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 9: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 13: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 23: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 23: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 25: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 25: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 25: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 27: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 27: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 29: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 29: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 31: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 5: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 5: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 5: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 1: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 7: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 7: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 7: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 3: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 3: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 22: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 12: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 12: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 30: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 30: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 24: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 28: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 20: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 20: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 20: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 34: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 44: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 6: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 6: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 6: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 6: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 32: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 32: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 46: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 48: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 41: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 41: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 41: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 37: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 47: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 49: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 45: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 38: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 63: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 19: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 19: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 21: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 15: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 16: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 2: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 4: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 14: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 18: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 10: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 8: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 26: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 17: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 9: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 9: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 9: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 13: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 23: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 25: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 27: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 29: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 31: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 5: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 1: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 7: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 3: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 22: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 12: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 12: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 30: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 24: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 28: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 20: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 6: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 32: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 32: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 32: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 37: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 47: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 49: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 63: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 15: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 16: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 16: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 2: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 4: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 18: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 18: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 10: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 8: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 0: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 0: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 26: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 17: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 9: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 13: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 13: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 23: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 25: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 31: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 5: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 1: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 1: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 7: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 3: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 22: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 12: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 30: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 30: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 24: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 28: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 20: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 32: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 37: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 63: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 16: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 2: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 18: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 10: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 0: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 13: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 23: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 25: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 31: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 1: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 7: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 3: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 22: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 22: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 30: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 24: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 28: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 20: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 37: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 16: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 2: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 18: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 10: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 0: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 13: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 31: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 3: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 22: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 24: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 28: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 20: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 2: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 10: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 3: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 22: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 24: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 28: [2022-11-26 05:23:10,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step18000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 0: [2022-11-26 05:23:10,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-26 05:23:10,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-26 05:23:10,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 27: [2022-11-26 05:23:10,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-26 05:23:10,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-26 05:23:10,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 56: [2022-11-26 05:23:10,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 56: [2022-11-26 05:23:10,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 56: [2022-11-26 05:23:10,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 21: [2022-11-26 05:23:10,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-26 05:23:10,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-26 05:23:10,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 52: [2022-11-26 05:23:10,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-26 05:23:10,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-26 05:23:10,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 2: [2022-11-26 05:23:10,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-26 05:23:10,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-26 05:23:10,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 50: [2022-11-26 05:23:10,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-26 05:23:10,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-26 05:23:10,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 19: [2022-11-26 05:23:10,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-26 05:23:10,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-26 05:23:10,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 46: [2022-11-26 05:23:10,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-26 05:23:10,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 46: [2022-11-26 05:23:10,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 43: [2022-11-26 05:23:10,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-26 05:23:10,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-26 05:23:10,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 14: [2022-11-26 05:23:10,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 18: [2022-11-26 05:23:10,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 12: [2022-11-26 05:23:10,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 18: [2022-11-26 05:23:10,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 12: [2022-11-26 05:23:10,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 18: [2022-11-26 05:23:10,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 12: [2022-11-26 05:23:10,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 14: [2022-11-26 05:23:10,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-26 05:23:10,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 58: [2022-11-26 05:23:10,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 4: [2022-11-26 05:23:10,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 58: [2022-11-26 05:23:10,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 4: [2022-11-26 05:23:10,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 58: [2022-11-26 05:23:10,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 4: [2022-11-26 05:23:10,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 42: [2022-11-26 05:23:10,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 23: [2022-11-26 05:23:10,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-26 05:23:10,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 42: [2022-11-26 05:23:10,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 23: [2022-11-26 05:23:10,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 42: [2022-11-26 05:23:10,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 44: [2022-11-26 05:23:10,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-26 05:23:10,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-26 05:23:10,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 31: [2022-11-26 05:23:10,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-26 05:23:10,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-26 05:23:10,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 59: [2022-11-26 05:23:10,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-26 05:23:10,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-26 05:23:10,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 25: [2022-11-26 05:23:10,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 25: [2022-11-26 05:23:10,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-26 05:23:10,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 13: [2022-11-26 05:23:10,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 13: [2022-11-26 05:23:10,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-26 05:23:10,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 36: [2022-11-26 05:23:10,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 36: [2022-11-26 05:23:10,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-26 05:23:10,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 48: [2022-11-26 05:23:10,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 48: [2022-11-26 05:23:10,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-26 05:23:10,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 8: [2022-11-26 05:23:10,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 0: [2022-11-26 05:23:10,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 40: [2022-11-26 05:23:10,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 39: [2022-11-26 05:23:10,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 8: [2022-11-26 05:23:10,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 40: [2022-11-26 05:23:10,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 39: [2022-11-26 05:23:10,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 8: [2022-11-26 05:23:10,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 40: [2022-11-26 05:23:10,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 39: [2022-11-26 05:23:10,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 32: [2022-11-26 05:23:10,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 38: [2022-11-26 05:23:10,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 32: [2022-11-26 05:23:10,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 0: [2022-11-26 05:23:10,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 32: [2022-11-26 05:23:10,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 0: [2022-11-26 05:23:10,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 45: [2022-11-26 05:23:10,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-26 05:23:10,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-26 05:23:10,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 54: [2022-11-26 05:23:10,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 54: [2022-11-26 05:23:10,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-26 05:23:10,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 7: [2022-11-26 05:23:10,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-26 05:23:10,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 7: [2022-11-26 05:23:10,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 47: [2022-11-26 05:23:10,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-26 05:23:10,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-26 05:23:10,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 10: [2022-11-26 05:23:10,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 10: [2022-11-26 05:23:10,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 10: [2022-11-26 05:23:10,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 5: [2022-11-26 05:23:10,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-26 05:23:10,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-26 05:23:10,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 10: [2022-11-26 05:23:10,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 10: [2022-11-26 05:23:10,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-26 05:23:10,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 56: [2022-11-26 05:23:10,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-26 05:23:10,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-26 05:23:10,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 49: [2022-11-26 05:23:10,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-26 05:23:10,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-26 05:23:10,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 38: [2022-11-26 05:23:10,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-26 05:23:10,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 14: [2022-11-26 05:23:10,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-26 05:23:10,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-26 05:23:10,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 33: [2022-11-26 05:23:10,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-26 05:23:10,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-26 05:23:10,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 7: [2022-11-26 05:23:10,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-26 05:23:10,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-26 05:23:10,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 39: [2022-11-26 05:23:10,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-26 05:23:10,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-26 05:23:10,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 40: [2022-11-26 05:23:10,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-26 05:23:10,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-26 05:23:10,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 17: [2022-11-26 05:23:10,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 17: [2022-11-26 05:23:10,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-26 05:23:10,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 50: [2022-11-26 05:23:10,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-26 05:23:10,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-26 05:23:10,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 37: [2022-11-26 05:23:10,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-26 05:23:10,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-26 05:23:10,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 16: [2022-11-26 05:23:10,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 43: [2022-11-26 05:23:10,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 16: [2022-11-26 05:23:10,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-26 05:23:10,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 5: [2022-11-26 05:23:10,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 43: [2022-11-26 05:23:10,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 5: [2022-11-26 05:23:10,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 43: [2022-11-26 05:23:10,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 16: [2022-11-26 05:23:10,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 5: [2022-11-26 05:23:10,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 16: [2022-11-26 05:23:10,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 4: [2022-11-26 05:23:10,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 16: [2022-11-26 05:23:10,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 4: [2022-11-26 05:23:10,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 44: [2022-11-26 05:23:10,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 19: [2022-11-26 05:23:10,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 4: [2022-11-26 05:23:10,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 44: [2022-11-26 05:23:10,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 19: [2022-11-26 05:23:10,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 44: [2022-11-26 05:23:10,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 19: [2022-11-26 05:23:10,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 13: [2022-11-26 05:23:10,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 61: [2022-11-26 05:23:10,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 13: [2022-11-26 05:23:10,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 61: [2022-11-26 05:23:10,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 58: [2022-11-26 05:23:10,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 13: [2022-11-26 05:23:10,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 61: [2022-11-26 05:23:10,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 58: [2022-11-26 05:23:10,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-26 05:23:10,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 32: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 48: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 32: [2022-11-26 05:23:10,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 48: [2022-11-26 05:23:10,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 32: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 21: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 33: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 51: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 21: [2022-11-26 05:23:10,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 12: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 51: [2022-11-26 05:23:10,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 21: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 33: [2022-11-26 05:23:10,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 12: [2022-11-26 05:23:10,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 51: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 33: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 12: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 51: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 51: [2022-11-26 05:23:10,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 53: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 26: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 53: [2022-11-26 05:23:10,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 26: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 53: [2022-11-26 05:23:10,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 26: [2022-11-26 05:23:10,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-26 05:23:10,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-26 05:23:10,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 26: [2022-11-26 05:23:10,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 46: [2022-11-26 05:23:10,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-26 05:23:10,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-26 05:23:10,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 40: [2022-11-26 05:23:10,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-26 05:23:10,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-26 05:23:10,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 35: [2022-11-26 05:23:10,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-26 05:23:10,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 57: [2022-11-26 05:23:10,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 35: [2022-11-26 05:23:10,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 57: [2022-11-26 05:23:10,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-26 05:23:10,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 57: [2022-11-26 05:23:10,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 59: [2022-11-26 05:23:10,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 57: [2022-11-26 05:23:10,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 57: [2022-11-26 05:23:10,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 59: [2022-11-26 05:23:10,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 38: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 59: [2022-11-26 05:23:10,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 38: [2022-11-26 05:23:10,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-26 05:23:10,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 45: [2022-11-26 05:23:10,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 62: [2022-11-26 05:23:10,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-26 05:23:10,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 45: [2022-11-26 05:23:10,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 62: [2022-11-26 05:23:10,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 45: [2022-11-26 05:23:10,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 2: [2022-11-26 05:23:10,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-26 05:23:10,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 25: [2022-11-26 05:23:10,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 2: [2022-11-26 05:23:10,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 25: [2022-11-26 05:23:10,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-26 05:23:10,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 32: [2022-11-26 05:23:10,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-26 05:23:10,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-26 05:23:10,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 49: [2022-11-26 05:23:10,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-26 05:23:10,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-26 05:23:10,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 62: [2022-11-26 05:23:10,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-26 05:23:10,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 23: [2022-11-26 05:23:10,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 62: [2022-11-26 05:23:10,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 23: [2022-11-26 05:23:10,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-26 05:23:10,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 36: [2022-11-26 05:23:10,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 31: [2022-11-26 05:23:10,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-26 05:23:10,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-26 05:23:10,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 20: [2022-11-26 05:23:10,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-26 05:23:10,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 11: [2022-11-26 05:23:10,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 20: [2022-11-26 05:23:10,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 11: [2022-11-26 05:23:10,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-26 05:23:10,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 36: [2022-11-26 05:23:10,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-26 05:23:10,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 10: [2022-11-26 05:23:10,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-26 05:23:10,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-26 05:23:10,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 56: [2022-11-26 05:23:10,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-26 05:23:10,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 9: [2022-11-26 05:23:10,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 56: [2022-11-26 05:23:10,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 9: [2022-11-26 05:23:10,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 17: [2022-11-26 05:23:10,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-26 05:23:10,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-26 05:23:10,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 51: [2022-11-26 05:23:10,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-26 05:23:10,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 54: [2022-11-26 05:23:10,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 51: [2022-11-26 05:23:10,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 15: [2022-11-26 05:23:10,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 54: [2022-11-26 05:23:10,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-26 05:23:10,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 15: [2022-11-26 05:23:10,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-26 05:23:10,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 21: [2022-11-26 05:23:10,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 12: [2022-11-26 05:23:10,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 12: [2022-11-26 05:23:10,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 21: [2022-11-26 05:23:10,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 12: [2022-11-26 05:23:10,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 21: [2022-11-26 05:23:10,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 7: [2022-11-26 05:23:10,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-26 05:23:10,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 47: [2022-11-26 05:23:10,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 7: [2022-11-26 05:23:10,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 47: [2022-11-26 05:23:10,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-26 05:23:10,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 0: [2022-11-26 05:23:10,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-26 05:23:10,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-26 05:23:10,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 38: [2022-11-26 05:23:10,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-26 05:23:10,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-26 05:23:10,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 36: [2022-11-26 05:23:10,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 33: [2022-11-26 05:23:10,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-26 05:23:10,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-26 05:23:10,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 59: [2022-11-26 05:23:10,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 59: [2022-11-26 05:23:10,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-26 05:23:10,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 37: [2022-11-26 05:23:10,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 37: [2022-11-26 05:23:10,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-26 05:23:10,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-26 05:23:10,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-26 05:23:10,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 37: [2022-11-26 05:23:10,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 26: [2022-11-26 05:23:10,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 26: [2022-11-26 05:23:10,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 26: [2022-11-26 05:23:10,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 41: [2022-11-26 05:23:10,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 63: [2022-11-26 05:23:10,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-26 05:23:10,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 41: [2022-11-26 05:23:10,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-26 05:23:10,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 41: [2022-11-26 05:23:10,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 63: [2022-11-26 05:23:10,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 41: [2022-11-26 05:23:10,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 63: [2022-11-26 05:23:10,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 41: [2022-11-26 05:23:10,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-26 05:23:10,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 63: [2022-11-26 05:23:10,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 41: [2022-11-26 05:23:10,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 41: [2022-11-26 05:23:10,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 63: [2022-11-26 05:23:10,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 54: [2022-11-26 05:23:10,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 54: [2022-11-26 05:23:10,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 48: [2022-11-26 05:23:10,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-26 05:23:10,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-26 05:23:10,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 19: [2022-11-26 05:23:10,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 54: [2022-11-26 05:23:10,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 44: [2022-11-26 05:23:10,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 19: [2022-11-26 05:23:10,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 5: [2022-11-26 05:23:10,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 44: [2022-11-26 05:23:10,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 19: [2022-11-26 05:23:10,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 5: [2022-11-26 05:23:10,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 44: [2022-11-26 05:23:10,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 5: [2022-11-26 05:23:10,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 39: [2022-11-26 05:23:10,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-26 05:23:10,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-26 05:23:10,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 50: [2022-11-26 05:23:10,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-26 05:23:10,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-26 05:23:10,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 13: [2022-11-26 05:23:10,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-26 05:23:10,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-26 05:23:10,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 8: [2022-11-26 05:23:10,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-26 05:23:10,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 8: [2022-11-26 05:23:10,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 14: [2022-11-26 05:23:10,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-26 05:23:10,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-26 05:23:10,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 53: [2022-11-26 05:23:10,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-26 05:23:10,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-26 05:23:10,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 57: [2022-11-26 05:23:10,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 57: [2022-11-26 05:23:10,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 0: [2022-11-26 05:23:10,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 57: [2022-11-26 05:23:10,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 36: [2022-11-26 05:23:10,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-26 05:23:10,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 35: [2022-11-26 05:23:10,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 61: [2022-11-26 05:23:10,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 35: [2022-11-26 05:23:10,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-26 05:23:10,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 61: [2022-11-26 05:23:10,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-26 05:23:10,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 46: [2022-11-26 05:23:10,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-26 05:23:10,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-26 05:23:10,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 16: [2022-11-26 05:23:10,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-26 05:23:10,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 16: [2022-11-26 05:23:10,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 23: [2022-11-26 05:23:10,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 58: [2022-11-26 05:23:10,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 23: [2022-11-26 05:23:10,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 58: [2022-11-26 05:23:10,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 23: [2022-11-26 05:23:10,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 58: [2022-11-26 05:23:10,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 25: [2022-11-26 05:23:10,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-26 05:23:10,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-26 05:23:10,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 29: [2022-11-26 05:23:10,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 29: [2022-11-26 05:23:10,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-26 05:23:10,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-26 05:23:10,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-26 05:23:10,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-26 05:23:10,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-26 05:23:10,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 29: [2022-11-26 05:23:10,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 29: [2022-11-26 05:23:10,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 28: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 28: [2022-11-26 05:23:10,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-26 05:23:10,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-26 05:23:10,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 55: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 55: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 28: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 28: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 55: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 28: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 55: [2022-11-26 05:23:10,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-26 05:23:10,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 55: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 55: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 55: [2022-11-26 05:23:10,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 30: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 55: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 30: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 9: [2022-11-26 05:23:10,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 30: [2022-11-26 05:23:10,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 9: [2022-11-26 05:23:10,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 30: [2022-11-26 05:23:10,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 9: [2022-11-26 05:23:10,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 30: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 9: [2022-11-26 05:23:10,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 11: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 31: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 11: [2022-11-26 05:23:10,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 31: [2022-11-26 05:23:10,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 11: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 31: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 43: [2022-11-26 05:23:10,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-26 05:23:10,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-26 05:23:10,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 15: [2022-11-26 05:23:10,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 15: [2022-11-26 05:23:10,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 15: [2022-11-26 05:23:10,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-26 05:23:10,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 15: [2022-11-26 05:23:10,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 2: [2022-11-26 05:23:10,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 20: [2022-11-26 05:23:10,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 15: [2022-11-26 05:23:10,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 2: [2022-11-26 05:23:10,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 20: [2022-11-26 05:23:10,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 2: [2022-11-26 05:23:10,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 20: [2022-11-26 05:23:10,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 62: [2022-11-26 05:23:10,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-26 05:23:10,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-26 05:23:10,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 63: [2022-11-26 05:23:10,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-26 05:23:10,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-26 05:23:10,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 1: [2022-11-26 05:23:10,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-26 05:23:10,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-26 05:23:10,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 1: [2022-11-26 05:23:10,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-26 05:23:10,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-26 05:23:10,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 1: [2022-11-26 05:23:10,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 1: [2022-11-26 05:23:10,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 1: [2022-11-26 05:23:10,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 4: [2022-11-26 05:23:10,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 4: [2022-11-26 05:23:10,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-26 05:23:10,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 61: [2022-11-26 05:23:10,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-26 05:23:10,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-26 05:23:10,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 9: [2022-11-26 05:23:10,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-26 05:23:10,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-26 05:23:10,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 49: [2022-11-26 05:23:10,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 47: [2022-11-26 05:23:10,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 49: [2022-11-26 05:23:10,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 47: [2022-11-26 05:23:10,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 49: [2022-11-26 05:23:10,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 47: [2022-11-26 05:23:10,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 45: [2022-11-26 05:23:10,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-26 05:23:10,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-26 05:23:10,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 22: [2022-11-26 05:23:10,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-26 05:23:10,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-26 05:23:10,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-26 05:23:10,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-26 05:23:10,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-26 05:23:10,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 22: [2022-11-26 05:23:10,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 22: [2022-11-26 05:23:10,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-26 05:23:10,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 34: [2022-11-26 05:23:10,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 34: [2022-11-26 05:23:10,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 2: [2022-11-26 05:23:10,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-26 05:23:10,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 34: [2022-11-26 05:23:10,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 34: [2022-11-26 05:23:10,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 2: [2022-11-26 05:23:10,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 34: [2022-11-26 05:23:10,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-26 05:23:10,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-26 05:23:10,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 34: [2022-11-26 05:23:10,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 34: [2022-11-26 05:23:10,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 45: [2022-11-26 05:23:10,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-26 05:23:10,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-26 05:23:10,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 0: [2022-11-26 05:23:10,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 42: [2022-11-26 05:23:10,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 0: [2022-11-26 05:23:10,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 42: [2022-11-26 05:23:10,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 42: [2022-11-26 05:23:10,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 35: [2022-11-26 05:23:10,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-26 05:23:10,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-26 05:23:10,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 30: [2022-11-26 05:23:10,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 3: [2022-11-26 05:23:10,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-26 05:23:10,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-26 05:23:10,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 30: [2022-11-26 05:23:10,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 3: [2022-11-26 05:23:10,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-26 05:23:10,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 30: [2022-11-26 05:23:10,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 3: [2022-11-26 05:23:10,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 3: [2022-11-26 05:23:10,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 3: [2022-11-26 05:23:10,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 40: [2022-11-26 05:23:10,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 3: [2022-11-26 05:23:10,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 40: [2022-11-26 05:23:10,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-26 05:23:10,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 20: [2022-11-26 05:23:10,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-26 05:23:10,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-26 05:23:10,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 8: [2022-11-26 05:23:10,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-26 05:23:10,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 4: [2022-11-26 05:23:10,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 8: [2022-11-26 05:23:10,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 4: [2022-11-26 05:23:10,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-26 05:23:10,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 6: [2022-11-26 05:23:10,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 6: [2022-11-26 05:23:10,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-26 05:23:10,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 6: [2022-11-26 05:23:10,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-26 05:23:10,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 6: [2022-11-26 05:23:10,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-26 05:23:10,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 6: [2022-11-26 05:23:10,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 6: [2022-11-26 05:23:10,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 13: [2022-11-26 05:23:10,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-26 05:23:10,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-26 05:23:10,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 17: [2022-11-26 05:23:10,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-26 05:23:10,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-26 05:23:10,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 27: [2022-11-26 05:23:10,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-26 05:23:10,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-26 05:23:10,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 28: [2022-11-26 05:23:10,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-26 05:23:10,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-26 05:23:10,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 10: [2022-11-26 05:23:10,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-26 05:23:10,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 10: [2022-11-26 05:23:10,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 11: [2022-11-26 05:23:10,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 11: [2022-11-26 05:23:10,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 11: [2022-11-26 05:23:10,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 12: [2022-11-26 05:23:10,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-26 05:23:10,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-26 05:23:10,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 18: [2022-11-26 05:23:10,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-26 05:23:10,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-26 05:23:10,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 56: [2022-11-26 05:23:10,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 56: [2022-11-26 05:23:10,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 56: [2022-11-26 05:23:10,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 21: [2022-11-26 05:23:10,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-26 05:23:10,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-26 05:23:10,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 7: [2022-11-26 05:23:10,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-26 05:23:10,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-26 05:23:10,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 14: [2022-11-26 05:23:10,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 60: [2022-11-26 05:23:10,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-26 05:23:10,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-26 05:23:10,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 60: [2022-11-26 05:23:10,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 14: [2022-11-26 05:23:10,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-26 05:23:10,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 60: [2022-11-26 05:23:10,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-26 05:23:10,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-26 05:23:10,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-26 05:23:10,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-26 05:23:10,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 60: [2022-11-26 05:23:10,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 60: [2022-11-26 05:23:10,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 60: [2022-11-26 05:23:10,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 3: [2022-11-26 05:23:10,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 3: [2022-11-26 05:23:10,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-26 05:23:10,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 29: [2022-11-26 05:23:10,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-26 05:23:10,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 29: [2022-11-26 05:23:10,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 31: [2022-11-26 05:23:10,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-26 05:23:10,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-26 05:23:10,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 37: [2022-11-26 05:23:10,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 37: [2022-11-26 05:23:10,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-26 05:23:10,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 42: [2022-11-26 05:23:10,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 42: [2022-11-26 05:23:10,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-26 05:23:10,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 18: [2022-11-26 05:23:10,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-26 05:23:10,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-26 05:23:10,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 34: [2022-11-26 05:23:10,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 8: [2022-11-26 05:23:10,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 34: [2022-11-26 05:23:10,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 8: [2022-11-26 05:23:10,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-26 05:23:10,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 34: [2022-11-26 05:23:10,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 24: [2022-11-26 05:23:10,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-26 05:23:10,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-26 05:23:10,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 24: [2022-11-26 05:23:10,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-26 05:23:10,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-26 05:23:10,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-26 05:23:10,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-26 05:23:10,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-26 05:23:10,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 24: [2022-11-26 05:23:10,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 24: [2022-11-26 05:23:10,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 24: [2022-11-26 05:23:10,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 16: [2022-11-26 05:23:10,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-26 05:23:10,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-26 05:23:10,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 18: [2022-11-26 05:23:10,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-26 05:23:10,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-26 05:23:10,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 23: [2022-11-26 05:23:10,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 23: [2022-11-26 05:23:10,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 23: [2022-11-26 05:23:10,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 58: [2022-11-26 05:23:10,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-26 05:23:10,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-26 05:23:10,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 49: [2022-11-26 05:23:10,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-26 05:23:10,793] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-26 05:23:10,793] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 36: [2022-11-26 05:23:10,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-26 05:23:10,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 36: [2022-11-26 05:23:10,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 43: [2022-11-26 05:23:10,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-26 05:23:10,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-26 05:23:10,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 51: [2022-11-26 05:23:10,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 51: [2022-11-26 05:23:10,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-26 05:23:10,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 1: [2022-11-26 05:23:10,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-26 05:23:10,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-26 05:23:10,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 39: [2022-11-26 05:23:10,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-26 05:23:10,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-26 05:23:10,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 52: [2022-11-26 05:23:10,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-26 05:23:10,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-26 05:23:10,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 32: [2022-11-26 05:23:10,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-26 05:23:10,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-26 05:23:10,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 50: [2022-11-26 05:23:10,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-26 05:23:10,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-26 05:23:10,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 59: [2022-11-26 05:23:10,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-26 05:23:10,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-26 05:23:10,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 55: [2022-11-26 05:23:10,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-26 05:23:10,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-26 05:23:10,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 26: [2022-11-26 05:23:10,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 26: [2022-11-26 05:23:10,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-26 05:23:10,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 27: [2022-11-26 05:23:10,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 27: [2022-11-26 05:23:10,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-26 05:23:10,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 61: [2022-11-26 05:23:10,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-26 05:23:10,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 27: [2022-11-26 05:23:10,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 41: [2022-11-26 05:23:10,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 61: [2022-11-26 05:23:10,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 27: [2022-11-26 05:23:10,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 41: [2022-11-26 05:23:10,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 27: [2022-11-26 05:23:10,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 41: [2022-11-26 05:23:10,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 52: [2022-11-26 05:23:10,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-26 05:23:10,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-26 05:23:10,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 19: [2022-11-26 05:23:10,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 19: [2022-11-26 05:23:10,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-26 05:23:10,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 46: [2022-11-26 05:23:10,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-26 05:23:10,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-26 05:23:10,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 22: [2022-11-26 05:23:10,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-26 05:23:10,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-26 05:23:10,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 38: [2022-11-26 05:23:10,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-26 05:23:10,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-26 05:23:10,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 53: [2022-11-26 05:23:10,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-26 05:23:10,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-26 05:23:10,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 9: [2022-11-26 05:23:10,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 54: [2022-11-26 05:23:10,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 54: [2022-11-26 05:23:10,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-26 05:23:10,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 47: [2022-11-26 05:23:10,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-26 05:23:10,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 9: [2022-11-26 05:23:10,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 47: [2022-11-26 05:23:10,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 9: [2022-11-26 05:23:10,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 42: [2022-11-26 05:23:10,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 42: [2022-11-26 05:23:10,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 35: [2022-11-26 05:23:10,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 42: [2022-11-26 05:23:10,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 0: [2022-11-26 05:23:10,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 35: [2022-11-26 05:23:10,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 5: [2022-11-26 05:23:10,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-26 05:23:10,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 0: [2022-11-26 05:23:10,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 5: [2022-11-26 05:23:10,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 35: [2022-11-26 05:23:10,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 0: [2022-11-26 05:23:10,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 62: [2022-11-26 05:23:10,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-26 05:23:10,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-26 05:23:10,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 48: [2022-11-26 05:23:10,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-26 05:23:10,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-26 05:23:10,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 25: [2022-11-26 05:23:10,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-26 05:23:10,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-26 05:23:10,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 57: [2022-11-26 05:23:10,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 57: [2022-11-26 05:23:10,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-26 05:23:10,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 33: [2022-11-26 05:23:10,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-26 05:23:10,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-26 05:23:10,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 45: [2022-11-26 05:23:10,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 45: [2022-11-26 05:23:10,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-26 05:23:10,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 63: [2022-11-26 05:23:10,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-26 05:23:10,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-26 05:23:10,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 30: [2022-11-26 05:23:10,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-26 05:23:10,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-26 05:23:10,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 2: [2022-11-26 05:23:10,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-26 05:23:10,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-26 05:23:10,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 40: [2022-11-26 05:23:10,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-26 05:23:10,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-26 05:23:10,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 44: [2022-11-26 05:23:10,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-26 05:23:10,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-26 05:23:10,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 15: [2022-11-26 05:23:10,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-26 05:23:10,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 15: [2022-11-26 05:23:10,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 52: [2022-11-26 05:23:10,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-26 05:23:10,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-26 05:23:10,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 17: [2022-11-26 05:23:10,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-26 05:23:10,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-26 05:23:10,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 21: [2022-11-26 05:23:10,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 21: [2022-11-26 05:23:10,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-26 05:23:10,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 31: [2022-11-26 05:23:10,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-26 05:23:10,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-26 05:23:10,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 12: [2022-11-26 05:23:10,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-26 05:23:10,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-26 05:23:10,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 20: [2022-11-26 05:23:10,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 4: [2022-11-26 05:23:10,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 4: [2022-11-26 05:23:10,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 20: [2022-11-26 05:23:10,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 4: [2022-11-26 05:23:10,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 20: [2022-11-26 05:23:10,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 6: [2022-11-26 05:23:10,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-26 05:23:10,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-26 05:23:10,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 11: [2022-11-26 05:23:10,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-26 05:23:10,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-26 05:23:10,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 3: [2022-11-26 05:23:10,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-26 05:23:10,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-26 05:23:10,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 29: [2022-11-26 05:23:10,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-26 05:23:10,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-26 05:23:10,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 13: [2022-11-26 05:23:10,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-26 05:23:10,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-26 05:23:10,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 18: [2022-11-26 05:23:10,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-26 05:23:10,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-26 05:23:10,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 27: [2022-11-26 05:23:10,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-26 05:23:10,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-26 05:23:10,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 14: [2022-11-26 05:23:10,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-26 05:23:10,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-26 05:23:10,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 56: [2022-11-26 05:23:10,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-26 05:23:10,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-26 05:23:10,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 60: [2022-11-26 05:23:10,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-26 05:23:10,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-26 05:23:10,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 28: [2022-11-26 05:23:10,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-26 05:23:10,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-26 05:23:10,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 7: [2022-11-26 05:23:10,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 24: [2022-11-26 05:23:10,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 8: [2022-11-26 05:23:10,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 24: [2022-11-26 05:23:10,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 8: [2022-11-26 05:23:10,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 7: [2022-11-26 05:23:10,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 24: [2022-11-26 05:23:10,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 8: [2022-11-26 05:23:10,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 7: [2022-11-26 05:23:10,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 37: [2022-11-26 05:23:10,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-26 05:23:10,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-26 05:23:10,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 10: [2022-11-26 05:23:10,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-26 05:23:10,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-26 05:23:10,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 16: [2022-11-26 05:23:10,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-26 05:23:10,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-26 05:23:10,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 34: [2022-11-26 05:23:10,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-26 05:23:10,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-26 05:23:10,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 42: [2022-11-26 05:23:10,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 42: [2022-11-26 05:23:10,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 42: [2022-11-26 05:23:10,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 39: [2022-11-26 05:23:10,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-26 05:23:10,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-26 05:23:10,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 23: [2022-11-26 05:23:10,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 23: [2022-11-26 05:23:10,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-26 05:23:10,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 49: [2022-11-26 05:23:10,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-26 05:23:10,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-26 05:23:10,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 51: [2022-11-26 05:23:10,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-26 05:23:10,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-26 05:23:10,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 43: [2022-11-26 05:23:10,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 43: [2022-11-26 05:23:10,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 43: [2022-11-26 05:23:10,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 58: [2022-11-26 05:23:10,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-26 05:23:10,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-26 05:23:10,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 32: [2022-11-26 05:23:10,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-26 05:23:10,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-26 05:23:10,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 36: [2022-11-26 05:23:10,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 36: [2022-11-26 05:23:10,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-26 05:23:10,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 50: [2022-11-26 05:23:10,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 50: [2022-11-26 05:23:10,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-26 05:23:10,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 1: [2022-11-26 05:23:10,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 1: [2022-11-26 05:23:10,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-26 05:23:10,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 59: [2022-11-26 05:23:10,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-26 05:23:10,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-26 05:23:10,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 26: [2022-11-26 05:23:10,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-26 05:23:10,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-26 05:23:10,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 48: [2022-11-26 05:23:10,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-26 05:23:10,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-26 05:23:10,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 46: [2022-11-26 05:23:10,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-26 05:23:10,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-26 05:23:10,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 55: [2022-11-26 05:23:10,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-26 05:23:10,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-26 05:23:10,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 41: [2022-11-26 05:23:10,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-26 05:23:10,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-26 05:23:10,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 62: [2022-11-26 05:23:10,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-26 05:23:10,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 62: [2022-11-26 05:23:10,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 22: [2022-11-26 05:23:10,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-26 05:23:10,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-26 05:23:10,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 19: [2022-11-26 05:23:10,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-26 05:23:10,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-26 05:23:10,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 35: [2022-11-26 05:23:10,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-26 05:23:10,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-26 05:23:10,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 25: [2022-11-26 05:23:10,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 25: [2022-11-26 05:23:10,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 25: [2022-11-26 05:23:10,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 38: [2022-11-26 05:23:10,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 38: [2022-11-26 05:23:10,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-26 05:23:10,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 61: [2022-11-26 05:23:10,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-26 05:23:10,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-26 05:23:10,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 54: [2022-11-26 05:23:10,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 53: [2022-11-26 05:23:10,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-26 05:23:10,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-26 05:23:10,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 0: [2022-11-26 05:23:10,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-26 05:23:10,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-26 05:23:10,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 47: [2022-11-26 05:23:10,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 47: [2022-11-26 05:23:10,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 47: [2022-11-26 05:23:10,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 40: [2022-11-26 05:23:10,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-26 05:23:10,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-26 05:23:10,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 44: [2022-11-26 05:23:10,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-26 05:23:10,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-26 05:23:10,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 52: [2022-11-26 05:23:10,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-26 05:23:10,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-26 05:23:10,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 54: [2022-11-26 05:23:10,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-26 05:23:10,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 2: [2022-11-26 05:23:10,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 21: [2022-11-26 05:23:10,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 2: [2022-11-26 05:23:10,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-26 05:23:10,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 21: [2022-11-26 05:23:10,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-26 05:23:10,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 9: [2022-11-26 05:23:10,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 9: [2022-11-26 05:23:10,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-26 05:23:10,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 15: [2022-11-26 05:23:10,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-26 05:23:10,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-26 05:23:10,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 5: [2022-11-26 05:23:10,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-26 05:23:10,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-26 05:23:10,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 57: [2022-11-26 05:23:10,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 57: [2022-11-26 05:23:10,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-26 05:23:10,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 30: [2022-11-26 05:23:10,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-26 05:23:10,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-26 05:23:10,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 63: [2022-11-26 05:23:10,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-26 05:23:10,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-26 05:23:10,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 6: [2022-11-26 05:23:10,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 6: [2022-11-26 05:23:10,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 6: [2022-11-26 05:23:10,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 33: [2022-11-26 05:23:10,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-26 05:23:10,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-26 05:23:10,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 45: [2022-11-26 05:23:10,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-26 05:23:10,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-26 05:23:10,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 17: [2022-11-26 05:23:10,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-26 05:23:10,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-26 05:23:10,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 11: [2022-11-26 05:23:10,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 11: [2022-11-26 05:23:10,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-26 05:23:10,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 3: [2022-11-26 05:23:10,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-26 05:23:10,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-26 05:23:10,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 20: [2022-11-26 05:23:10,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-26 05:23:10,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-26 05:23:10,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 14: [2022-11-26 05:23:10,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-26 05:23:10,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-26 05:23:10,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 60: [2022-11-26 05:23:10,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-26 05:23:10,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-26 05:23:10,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 4: [2022-11-26 05:23:10,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 4: [2022-11-26 05:23:10,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-26 05:23:10,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 29: [2022-11-26 05:23:10,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-26 05:23:10,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 7: [2022-11-26 05:23:10,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 7: [2022-11-26 05:23:10,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-26 05:23:10,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 29: [2022-11-26 05:23:10,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 12: [2022-11-26 05:23:10,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-26 05:23:10,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-26 05:23:10,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 10: [2022-11-26 05:23:10,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-26 05:23:10,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-26 05:23:10,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 56: [2022-11-26 05:23:10,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 13: [2022-11-26 05:23:10,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 56: [2022-11-26 05:23:10,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-26 05:23:10,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 13: [2022-11-26 05:23:10,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 18: [2022-11-26 05:23:10,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 13: [2022-11-26 05:23:10,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 18: [2022-11-26 05:23:10,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-26 05:23:10,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 27: [2022-11-26 05:23:10,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 8: [2022-11-26 05:23:10,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 27: [2022-11-26 05:23:10,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-26 05:23:10,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 8: [2022-11-26 05:23:10,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-26 05:23:10,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 43: [2022-11-26 05:23:10,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-26 05:23:10,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-26 05:23:10,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 31: [2022-11-26 05:23:10,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-26 05:23:10,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-26 05:23:10,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 42: [2022-11-26 05:23:10,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 42: [2022-11-26 05:23:10,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-26 05:23:10,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 37: [2022-11-26 05:23:10,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-26 05:23:10,925] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-26 05:23:10,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 32: [2022-11-26 05:23:10,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 32: [2022-11-26 05:23:10,926] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 34: [2022-11-26 05:23:10,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 32: [2022-11-26 05:23:10,926] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 34: [2022-11-26 05:23:10,926] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-26 05:23:10,926] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 24: [2022-11-26 05:23:10,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-26 05:23:10,926] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-26 05:23:10,926] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 58: [2022-11-26 05:23:10,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 23: [2022-11-26 05:23:10,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 58: [2022-11-26 05:23:10,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 23: [2022-11-26 05:23:10,929] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 58: [2022-11-26 05:23:10,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 23: [2022-11-26 05:23:10,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 41: [2022-11-26 05:23:10,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-26 05:23:10,929] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-26 05:23:10,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 16: [2022-11-26 05:23:10,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-26 05:23:10,931] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-26 05:23:10,931] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 49: [2022-11-26 05:23:10,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-26 05:23:10,931] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-26 05:23:10,931] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 50: [2022-11-26 05:23:10,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-26 05:23:10,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-26 05:23:10,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 51: [2022-11-26 05:23:10,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 51: [2022-11-26 05:23:10,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-26 05:23:10,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 26: [2022-11-26 05:23:10,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 36: [2022-11-26 05:23:10,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-26 05:23:10,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 26: [2022-11-26 05:23:10,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-26 05:23:10,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 36: [2022-11-26 05:23:10,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 1: [2022-11-26 05:23:10,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-26 05:23:10,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 1: [2022-11-26 05:23:10,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 39: [2022-11-26 05:23:10,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-26 05:23:10,935] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-26 05:23:10,935] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 48: [2022-11-26 05:23:10,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-26 05:23:10,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-26 05:23:10,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 46: [2022-11-26 05:23:10,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 22: [2022-11-26 05:23:10,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-26 05:23:10,937] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 46: [2022-11-26 05:23:10,937] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 22: [2022-11-26 05:23:10,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 46: [2022-11-26 05:23:10,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 35: [2022-11-26 05:23:10,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-26 05:23:10,937] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-26 05:23:10,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 59: [2022-11-26 05:23:10,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-26 05:23:10,938] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-26 05:23:10,938] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 19: [2022-11-26 05:23:10,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-26 05:23:10,938] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-26 05:23:10,938] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 38: [2022-11-26 05:23:10,939] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-26 05:23:10,939] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-26 05:23:10,939] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 54: [2022-11-26 05:23:10,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 54: [2022-11-26 05:23:10,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-26 05:23:10,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 61: [2022-11-26 05:23:10,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-26 05:23:10,941] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 62: [2022-11-26 05:23:10,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 61: [2022-11-26 05:23:10,941] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 62: [2022-11-26 05:23:10,941] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-26 05:23:10,941] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 25: [2022-11-26 05:23:10,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 52: [2022-11-26 05:23:10,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 25: [2022-11-26 05:23:10,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 52: [2022-11-26 05:23:10,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 25: [2022-11-26 05:23:10,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 52: [2022-11-26 05:23:10,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 44: [2022-11-26 05:23:10,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-26 05:23:10,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-26 05:23:10,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 53: [2022-11-26 05:23:10,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-26 05:23:10,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-26 05:23:10,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 6: [2022-11-26 05:23:10,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-26 05:23:10,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-26 05:23:10,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 40: [2022-11-26 05:23:10,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-26 05:23:10,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-26 05:23:10,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 57: [2022-11-26 05:23:10,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 45: [2022-11-26 05:23:10,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 57: [2022-11-26 05:23:10,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 45: [2022-11-26 05:23:10,948] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 57: [2022-11-26 05:23:10,948] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 45: [2022-11-26 05:23:10,948] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 5: [2022-11-26 05:23:10,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-26 05:23:10,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 5: [2022-11-26 05:23:10,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 47: [2022-11-26 05:23:10,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-26 05:23:10,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-26 05:23:10,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 2: [2022-11-26 05:23:10,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-26 05:23:10,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-26 05:23:10,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 0: [2022-11-26 05:23:10,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-26 05:23:10,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-26 05:23:10,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 9: [2022-11-26 05:23:10,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 9: [2022-11-26 05:23:10,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-26 05:23:10,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 17: [2022-11-26 05:23:10,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 17: [2022-11-26 05:23:10,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 17: [2022-11-26 05:23:10,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 30: [2022-11-26 05:23:10,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 15: [2022-11-26 05:23:10,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 30: [2022-11-26 05:23:10,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 15: [2022-11-26 05:23:10,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-26 05:23:10,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 33: [2022-11-26 05:23:10,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 33: [2022-11-26 05:23:10,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 30: [2022-11-26 05:23:10,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 33: [2022-11-26 05:23:10,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 4: [2022-11-26 05:23:10,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-26 05:23:10,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-26 05:23:10,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 29: [2022-11-26 05:23:10,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 31: [2022-11-26 05:23:10,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 29: [2022-11-26 05:23:10,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-26 05:23:10,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 31: [2022-11-26 05:23:10,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-26 05:23:10,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 55: [2022-11-26 05:23:10,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-26 05:23:10,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-26 05:23:10,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 18: [2022-11-26 05:23:10,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-26 05:23:10,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-26 05:23:10,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 3: [2022-11-26 05:23:10,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 3: [2022-11-26 05:23:10,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-26 05:23:10,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 11: [2022-11-26 05:23:10,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 11: [2022-11-26 05:23:10,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 11: [2022-11-26 05:23:10,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 20: [2022-11-26 05:23:10,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 37: [2022-11-26 05:23:10,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 14: [2022-11-26 05:23:10,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 20: [2022-11-26 05:23:10,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 37: [2022-11-26 05:23:10,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 14: [2022-11-26 05:23:10,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 20: [2022-11-26 05:23:10,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 37: [2022-11-26 05:23:10,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 14: [2022-11-26 05:23:10,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 8: [2022-11-26 05:23:10,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 34: [2022-11-26 05:23:10,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 8: [2022-11-26 05:23:10,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-26 05:23:10,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 34: [2022-11-26 05:23:10,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-26 05:23:10,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 10: [2022-11-26 05:23:10,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-26 05:23:10,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-26 05:23:10,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 24: [2022-11-26 05:23:10,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 24: [2022-11-26 05:23:10,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 7: [2022-11-26 05:23:10,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 24: [2022-11-26 05:23:10,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 12: [2022-11-26 05:23:10,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 12: [2022-11-26 05:23:10,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-26 05:23:10,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 7: [2022-11-26 05:23:10,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-26 05:23:10,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 60: [2022-11-26 05:23:10,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-26 05:23:10,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-26 05:23:10,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 63: [2022-11-26 05:23:10,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-26 05:23:10,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-26 05:23:10,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 42: [2022-11-26 05:23:10,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 23: [2022-11-26 05:23:10,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 42: [2022-11-26 05:23:10,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 28: [2022-11-26 05:23:10,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 23: [2022-11-26 05:23:10,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 28: [2022-11-26 05:23:10,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 42: [2022-11-26 05:23:10,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 23: [2022-11-26 05:23:10,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 28: [2022-11-26 05:23:10,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 13: [2022-11-26 05:23:10,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 13: [2022-11-26 05:23:10,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-26 05:23:10,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 49: [2022-11-26 05:23:10,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-26 05:23:10,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 56: [2022-11-26 05:23:10,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 49: [2022-11-26 05:23:10,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 56: [2022-11-26 05:23:10,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 56: [2022-11-26 05:23:10,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 27: [2022-11-26 05:23:10,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-26 05:23:10,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-26 05:23:10,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 36: [2022-11-26 05:23:10,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-26 05:23:10,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-26 05:23:10,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 58: [2022-11-26 05:23:10,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-26 05:23:10,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-26 05:23:10,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 16: [2022-11-26 05:23:10,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-26 05:23:10,971] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-26 05:23:10,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 50: [2022-11-26 05:23:10,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 50: [2022-11-26 05:23:10,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-26 05:23:10,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 43: [2022-11-26 05:23:10,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-26 05:23:10,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-26 05:23:10,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 26: [2022-11-26 05:23:10,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-26 05:23:10,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-26 05:23:10,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 1: [2022-11-26 05:23:10,978] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-26 05:23:10,978] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-26 05:23:10,978] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 32: [2022-11-26 05:23:10,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-26 05:23:10,979] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-26 05:23:10,979] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 51: [2022-11-26 05:23:10,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-26 05:23:10,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-26 05:23:10,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 55: [2022-11-26 05:23:10,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 55: [2022-11-26 05:23:10,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-26 05:23:10,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 39: [2022-11-26 05:23:10,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-26 05:23:10,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-26 05:23:10,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 46: [2022-11-26 05:23:10,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-26 05:23:10,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 21: [2022-11-26 05:23:10,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 46: [2022-11-26 05:23:10,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 21: [2022-11-26 05:23:10,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 21: [2022-11-26 05:23:10,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 59: [2022-11-26 05:23:10,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-26 05:23:10,986] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-26 05:23:10,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 61: [2022-11-26 05:23:10,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 61: [2022-11-26 05:23:10,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-26 05:23:10,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 41: [2022-11-26 05:23:10,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-26 05:23:10,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 41: [2022-11-26 05:23:10,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 38: [2022-11-26 05:23:10,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-26 05:23:10,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-26 05:23:10,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 35: [2022-11-26 05:23:10,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 35: [2022-11-26 05:23:10,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-26 05:23:10,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 22: [2022-11-26 05:23:10,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-26 05:23:10,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-26 05:23:10,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 48: [2022-11-26 05:23:10,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-26 05:23:10,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-26 05:23:10,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 62: [2022-11-26 05:23:10,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-26 05:23:10,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-26 05:23:10,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 44: [2022-11-26 05:23:10,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-26 05:23:10,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-26 05:23:10,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 54: [2022-11-26 05:23:10,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 54: [2022-11-26 05:23:10,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-26 05:23:10,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 53: [2022-11-26 05:23:10,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-26 05:23:10,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-26 05:23:10,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 19: [2022-11-26 05:23:10,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 19: [2022-11-26 05:23:10,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-26 05:23:10,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 57: [2022-11-26 05:23:11,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 57: [2022-11-26 05:23:11,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 40: [2022-11-26 05:23:11,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 57: [2022-11-26 05:23:11,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 40: [2022-11-26 05:23:11,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-26 05:23:11,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 5: [2022-11-26 05:23:11,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-26 05:23:11,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-26 05:23:11,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 21: [2022-11-26 05:23:11,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-26 05:23:11,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-26 05:23:11,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 2: [2022-11-26 05:23:11,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 2: [2022-11-26 05:23:11,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-26 05:23:11,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 25: [2022-11-26 05:23:11,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 0: [2022-11-26 05:23:11,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 25: [2022-11-26 05:23:11,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 0: [2022-11-26 05:23:11,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 25: [2022-11-26 05:23:11,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 0: [2022-11-26 05:23:11,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 45: [2022-11-26 05:23:11,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 45: [2022-11-26 05:23:11,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-26 05:23:11,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 33: [2022-11-26 05:23:11,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-26 05:23:11,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-26 05:23:11,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 15: [2022-11-26 05:23:11,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-26 05:23:11,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-26 05:23:11,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 52: [2022-11-26 05:23:11,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-26 05:23:11,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-26 05:23:11,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 9: [2022-11-26 05:23:11,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 17: [2022-11-26 05:23:11,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-26 05:23:11,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-26 05:23:11,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 63: [2022-11-26 05:23:11,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-26 05:23:11,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-26 05:23:11,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 9: [2022-11-26 05:23:11,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-26 05:23:11,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 30: [2022-11-26 05:23:11,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-26 05:23:11,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 51: [2022-11-26 05:23:11,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 30: [2022-11-26 05:23:11,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 5: [2022-11-26 05:23:11,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 51: [2022-11-26 05:23:11,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-26 05:23:11,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 5: [2022-11-26 05:23:11,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-26 05:23:11,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 1: [2022-11-26 05:23:11,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-26 05:23:11,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-26 05:23:11,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 29: [2022-11-26 05:23:11,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 62: [2022-11-26 05:23:11,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 4: [2022-11-26 05:23:11,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 29: [2022-11-26 05:23:11,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 4: [2022-11-26 05:23:11,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 62: [2022-11-26 05:23:11,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 4: [2022-11-26 05:23:11,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 29: [2022-11-26 05:23:11,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 38: [2022-11-26 05:23:11,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 62: [2022-11-26 05:23:11,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 38: [2022-11-26 05:23:11,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-26 05:23:11,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 33: [2022-11-26 05:23:11,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 3: [2022-11-26 05:23:11,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 33: [2022-11-26 05:23:11,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 3: [2022-11-26 05:23:11,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 33: [2022-11-26 05:23:11,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 3: [2022-11-26 05:23:11,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 39: [2022-11-26 05:23:11,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 44: [2022-11-26 05:23:11,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 39: [2022-11-26 05:23:11,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 44: [2022-11-26 05:23:11,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 39: [2022-11-26 05:23:11,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 44: [2022-11-26 05:23:11,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 57: [2022-11-26 05:23:11,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 58: [2022-11-26 05:23:11,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 57: [2022-11-26 05:23:11,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 57: [2022-11-26 05:23:11,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 58: [2022-11-26 05:23:11,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-26 05:23:11,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 11: [2022-11-26 05:23:11,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-26 05:23:11,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-26 05:23:11,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 7: [2022-11-26 05:23:11,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 32: [2022-11-26 05:23:11,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 7: [2022-11-26 05:23:11,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 41: [2022-11-26 05:23:11,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 50: [2022-11-26 05:23:11,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 7: [2022-11-26 05:23:11,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 32: [2022-11-26 05:23:11,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-26 05:23:11,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 41: [2022-11-26 05:23:11,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 50: [2022-11-26 05:23:11,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 28: [2022-11-26 05:23:11,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 41: [2022-11-26 05:23:11,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 50: [2022-11-26 05:23:11,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 28: [2022-11-26 05:23:11,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-26 05:23:11,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 36: [2022-11-26 05:23:11,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-26 05:23:11,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-26 05:23:11,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 27: [2022-11-26 05:23:11,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 27: [2022-11-26 05:23:11,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-26 05:23:11,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 12: [2022-11-26 05:23:11,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 25: [2022-11-26 05:23:11,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 24: [2022-11-26 05:23:11,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 25: [2022-11-26 05:23:11,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 22: [2022-11-26 05:23:11,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 12: [2022-11-26 05:23:11,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 25: [2022-11-26 05:23:11,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 12: [2022-11-26 05:23:11,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 24: [2022-11-26 05:23:11,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 22: [2022-11-26 05:23:11,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 24: [2022-11-26 05:23:11,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 22: [2022-11-26 05:23:11,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 30: [2022-11-26 05:23:11,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-26 05:23:11,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-26 05:23:11,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 14: [2022-11-26 05:23:11,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 9: [2022-11-26 05:23:11,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 14: [2022-11-26 05:23:11,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-26 05:23:11,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 9: [2022-11-26 05:23:11,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 37: [2022-11-26 05:23:11,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 9: [2022-11-26 05:23:11,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 37: [2022-11-26 05:23:11,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-26 05:23:11,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 19: [2022-11-26 05:23:11,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 10: [2022-11-26 05:23:11,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 19: [2022-11-26 05:23:11,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 10: [2022-11-26 05:23:11,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 19: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 60: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 60: [2022-11-26 05:23:11,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 10: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 26: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 60: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 26: [2022-11-26 05:23:11,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 42: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 54: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 18: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 42: [2022-11-26 05:23:11,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 54: [2022-11-26 05:23:11,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 18: [2022-11-26 05:23:11,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 31: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 42: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 54: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 18: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 16: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 6: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 31: [2022-11-26 05:23:11,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 6: [2022-11-26 05:23:11,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 16: [2022-11-26 05:23:11,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 31: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 6: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 16: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 56: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 56: [2022-11-26 05:23:11,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 20: [2022-11-26 05:23:11,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 55: [2022-11-26 05:23:11,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 20: [2022-11-26 05:23:11,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 55: [2022-11-26 05:23:11,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 20: [2022-11-26 05:23:11,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 55: [2022-11-26 05:23:11,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 15: [2022-11-26 05:23:11,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-26 05:23:11,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-26 05:23:11,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 11: [2022-11-26 05:23:11,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 11: [2022-11-26 05:23:11,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 11: [2022-11-26 05:23:11,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 59: [2022-11-26 05:23:11,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-26 05:23:11,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-26 05:23:11,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 23: [2022-11-26 05:23:11,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 35: [2022-11-26 05:23:11,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 23: [2022-11-26 05:23:11,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 35: [2022-11-26 05:23:11,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 47: [2022-11-26 05:23:11,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 23: [2022-11-26 05:23:11,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 35: [2022-11-26 05:23:11,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 47: [2022-11-26 05:23:11,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-26 05:23:11,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 20: [2022-11-26 05:23:11,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-26 05:23:11,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-26 05:23:11,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 13: [2022-11-26 05:23:11,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 28: [2022-11-26 05:23:11,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-26 05:23:11,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 13: [2022-11-26 05:23:11,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-26 05:23:11,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 28: [2022-11-26 05:23:11,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 8: [2022-11-26 05:23:11,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-26 05:23:11,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-26 05:23:11,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 17: [2022-11-26 05:23:11,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-26 05:23:11,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-26 05:23:11,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 34: [2022-11-26 05:23:11,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 34: [2022-11-26 05:23:11,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-26 05:23:11,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 52: [2022-11-26 05:23:11,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 61: [2022-11-26 05:23:11,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 61: [2022-11-26 05:23:11,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 52: [2022-11-26 05:23:11,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-26 05:23:11,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 61: [2022-11-26 05:23:11,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 49: [2022-11-26 05:23:11,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-26 05:23:11,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 49: [2022-11-26 05:23:11,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 46: [2022-11-26 05:23:11,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-26 05:23:11,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-26 05:23:11,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 47: [2022-11-26 05:23:11,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-26 05:23:11,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-26 05:23:11,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 63: [2022-11-26 05:23:11,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-26 05:23:11,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-26 05:23:11,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 48: [2022-11-26 05:23:11,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-26 05:23:11,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-26 05:23:11,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 6: [2022-11-26 05:23:11,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-26 05:23:11,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-26 05:23:11,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 53: [2022-11-26 05:23:11,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-26 05:23:11,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-26 05:23:11,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 43: [2022-11-26 05:23:11,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-26 05:23:11,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-26 05:23:11,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 53: [2022-11-26 05:23:11,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-26 05:23:11,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step18000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-26 05:23:11,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! 0: successfully saved checkpoint at iteration 18000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5138.37 63: iteration 18010/ 24424 | consumed samples: 9221120 | consumed tokens: 18884853760 | elapsed time per iteration (s): 2.83 | learning rate: 4.949E-05 | global batch size: 512 | lm loss: 2.004522E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.175 | TFLOPs: 18.65 | 63: iteration 18020/ 24424 | consumed samples: 9226240 | consumed tokens: 18895339520 | elapsed time per iteration (s): 2.24 | learning rate: 4.940E-05 | global batch size: 512 | lm loss: 1.999189E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.016 | TFLOPs: 23.58 | 63: iteration 18030/ 24424 | consumed samples: 9231360 | consumed tokens: 18905825280 | elapsed time per iteration (s): 2.24 | learning rate: 4.932E-05 | global batch size: 512 | lm loss: 2.027622E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.689 | TFLOPs: 23.54 | 63: iteration 18040/ 24424 | consumed samples: 9236480 | consumed tokens: 18916311040 | elapsed time per iteration (s): 2.24 | learning rate: 4.923E-05 | global batch size: 512 | lm loss: 2.009674E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.427 | TFLOPs: 23.52 | 63: iteration 18050/ 24424 | consumed samples: 9241600 | consumed tokens: 18926796800 | elapsed time per iteration (s): 2.25 | learning rate: 4.914E-05 | global batch size: 512 | lm loss: 2.018863E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.904 | TFLOPs: 23.46 | 63: iteration 18060/ 24424 | consumed samples: 9246720 | consumed tokens: 18937282560 | elapsed time per iteration (s): 2.23 | learning rate: 4.906E-05 | global batch size: 512 | lm loss: 2.013111E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.561 | TFLOPs: 23.63 | 63: iteration 18070/ 24424 | consumed samples: 9251840 | consumed tokens: 18947768320 | elapsed time per iteration (s): 2.33 | learning rate: 4.897E-05 | global batch size: 512 | lm loss: 1.991728E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 219.931 | TFLOPs: 22.64 | 63: iteration 18080/ 24424 | consumed samples: 9256960 | consumed tokens: 18958254080 | elapsed time per iteration (s): 2.23 | learning rate: 4.889E-05 | global batch size: 512 | lm loss: 2.040145E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.969 | TFLOPs: 23.67 | 63: iteration 18090/ 24424 | consumed samples: 9262080 | consumed tokens: 18968739840 | elapsed time per iteration (s): 2.34 | learning rate: 4.880E-05 | global batch size: 512 | lm loss: 2.019444E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 219.209 | TFLOPs: 22.57 | 63: iteration 18100/ 24424 | consumed samples: 9267200 | consumed tokens: 18979225600 | elapsed time per iteration (s): 2.23 | learning rate: 4.871E-05 | global batch size: 512 | lm loss: 2.019677E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.023 | TFLOPs: 23.68 | 63: iteration 18110/ 24424 | consumed samples: 9272320 | consumed tokens: 18989711360 | elapsed time per iteration (s): 2.36 | learning rate: 4.863E-05 | global batch size: 512 | lm loss: 2.017726E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 217.145 | TFLOPs: 22.35 | 63: iteration 18120/ 24424 | consumed samples: 9277440 | consumed tokens: 19000197120 | elapsed time per iteration (s): 2.23 | learning rate: 4.854E-05 | global batch size: 512 | lm loss: 2.026269E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.793 | TFLOPs: 23.66 | 63: iteration 18130/ 24424 | consumed samples: 9282560 | consumed tokens: 19010682880 | elapsed time per iteration (s): 2.26 | learning rate: 4.846E-05 | global batch size: 512 | lm loss: 1.996346E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.564 | TFLOPs: 23.32 | 63: iteration 18140/ 24424 | consumed samples: 9287680 | consumed tokens: 19021168640 | elapsed time per iteration (s): 2.23 | learning rate: 4.837E-05 | global batch size: 512 | lm loss: 2.015709E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.099 | TFLOPs: 23.69 | 63: iteration 18150/ 24424 | consumed samples: 9292800 | consumed tokens: 19031654400 | elapsed time per iteration (s): 2.25 | learning rate: 4.829E-05 | global batch size: 512 | lm loss: 2.027745E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.779 | TFLOPs: 23.45 | 63: iteration 18160/ 24424 | consumed samples: 9297920 | consumed tokens: 19042140160 | elapsed time per iteration (s): 2.24 | learning rate: 4.820E-05 | global batch size: 512 | lm loss: 2.011116E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.447 | TFLOPs: 23.52 | 63: iteration 18170/ 24424 | consumed samples: 9303040 | consumed tokens: 19052625920 | elapsed time per iteration (s): 2.23 | learning rate: 4.812E-05 | global batch size: 512 | lm loss: 2.001398E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.056 | TFLOPs: 23.68 | 63: iteration 18180/ 24424 | consumed samples: 9308160 | consumed tokens: 19063111680 | elapsed time per iteration (s): 2.23 | learning rate: 4.803E-05 | global batch size: 512 | lm loss: 2.004612E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.083 | TFLOPs: 23.69 | 63: iteration 18190/ 24424 | consumed samples: 9313280 | consumed tokens: 19073597440 | elapsed time per iteration (s): 2.23 | learning rate: 4.795E-05 | global batch size: 512 | lm loss: 2.017766E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.522 | TFLOPs: 23.63 | 63: iteration 18200/ 24424 | consumed samples: 9318400 | consumed tokens: 19084083200 | elapsed time per iteration (s): 2.24 | learning rate: 4.786E-05 | global batch size: 512 | lm loss: 2.015959E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.220 | TFLOPs: 23.49 | 63: iteration 18210/ 24424 | consumed samples: 9323520 | consumed tokens: 19094568960 | elapsed time per iteration (s): 2.26 | learning rate: 4.778E-05 | global batch size: 512 | lm loss: 1.998203E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.753 | TFLOPs: 23.34 | 63: iteration 18220/ 24424 | consumed samples: 9328640 | consumed tokens: 19105054720 | elapsed time per iteration (s): 2.30 | learning rate: 4.769E-05 | global batch size: 512 | lm loss: 2.004164E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.623 | TFLOPs: 22.92 | 63: iteration 18230/ 24424 | consumed samples: 9333760 | consumed tokens: 19115540480 | elapsed time per iteration (s): 2.24 | learning rate: 4.761E-05 | global batch size: 512 | lm loss: 2.017521E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.016 | TFLOPs: 23.58 | 63: iteration 18240/ 24424 | consumed samples: 9338880 | consumed tokens: 19126026240 | elapsed time per iteration (s): 2.23 | learning rate: 4.753E-05 | global batch size: 512 | lm loss: 2.007485E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.261 | TFLOPs: 23.60 | 63: iteration 18250/ 24424 | consumed samples: 9344000 | consumed tokens: 19136512000 | elapsed time per iteration (s): 3.10 | learning rate: 4.744E-05 | global batch size: 512 | lm loss: 2.008107E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 165.034 | TFLOPs: 16.99 | 63: iteration 18260/ 24424 | consumed samples: 9349120 | consumed tokens: 19146997760 | elapsed time per iteration (s): 2.23 | learning rate: 4.736E-05 | global batch size: 512 | lm loss: 2.000862E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.077 | TFLOPs: 23.69 | 63: iteration 18270/ 24424 | consumed samples: 9354240 | consumed tokens: 19157483520 | elapsed time per iteration (s): 2.24 | learning rate: 4.727E-05 | global batch size: 512 | lm loss: 2.013069E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.179 | TFLOPs: 23.49 | 63: iteration 18280/ 24424 | consumed samples: 9359360 | consumed tokens: 19167969280 | elapsed time per iteration (s): 2.24 | learning rate: 4.719E-05 | global batch size: 512 | lm loss: 2.013786E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.254 | TFLOPs: 23.50 | 63: iteration 18290/ 24424 | consumed samples: 9364480 | consumed tokens: 19178455040 | elapsed time per iteration (s): 2.24 | learning rate: 4.711E-05 | global batch size: 512 | lm loss: 1.991059E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.498 | TFLOPs: 23.52 | 63: iteration 18300/ 24424 | consumed samples: 9369600 | consumed tokens: 19188940800 | elapsed time per iteration (s): 2.23 | learning rate: 4.702E-05 | global batch size: 512 | lm loss: 1.983618E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.848 | TFLOPs: 23.66 | 63: iteration 18310/ 24424 | consumed samples: 9374720 | consumed tokens: 19199426560 | elapsed time per iteration (s): 2.27 | learning rate: 4.694E-05 | global batch size: 512 | lm loss: 2.025805E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.011 | TFLOPs: 23.27 | 63: iteration 18320/ 24424 | consumed samples: 9379840 | consumed tokens: 19209912320 | elapsed time per iteration (s): 2.23 | learning rate: 4.686E-05 | global batch size: 512 | lm loss: 1.989349E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.546 | TFLOPs: 23.63 | 63: iteration 18330/ 24424 | consumed samples: 9384960 | consumed tokens: 19220398080 | elapsed time per iteration (s): 2.24 | learning rate: 4.677E-05 | global batch size: 512 | lm loss: 1.998860E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.258 | TFLOPs: 23.50 | 63: iteration 18340/ 24424 | consumed samples: 9390080 | consumed tokens: 19230883840 | elapsed time per iteration (s): 2.24 | learning rate: 4.669E-05 | global batch size: 512 | lm loss: 2.016991E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.546 | TFLOPs: 23.53 | 63: iteration 18350/ 24424 | consumed samples: 9395200 | consumed tokens: 19241369600 | elapsed time per iteration (s): 2.23 | learning rate: 4.661E-05 | global batch size: 512 | lm loss: 2.005175E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.463 | TFLOPs: 23.62 | 63: iteration 18360/ 24424 | consumed samples: 9400320 | consumed tokens: 19251855360 | elapsed time per iteration (s): 2.23 | learning rate: 4.652E-05 | global batch size: 512 | lm loss: 1.995487E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.962 | TFLOPs: 23.67 | 63: iteration 18370/ 24424 | consumed samples: 9405440 | consumed tokens: 19262341120 | elapsed time per iteration (s): 2.23 | learning rate: 4.644E-05 | global batch size: 512 | lm loss: 2.020354E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.225 | TFLOPs: 23.60 | 63: iteration 18380/ 24424 | consumed samples: 9410560 | consumed tokens: 19272826880 | elapsed time per iteration (s): 2.23 | learning rate: 4.636E-05 | global batch size: 512 | lm loss: 2.003167E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.720 | TFLOPs: 23.65 | 63: iteration 18390/ 24424 | consumed samples: 9415680 | consumed tokens: 19283312640 | elapsed time per iteration (s): 2.23 | learning rate: 4.627E-05 | global batch size: 512 | lm loss: 2.015822E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.784 | TFLOPs: 23.66 | 63: iteration 18400/ 24424 | consumed samples: 9420800 | consumed tokens: 19293798400 | elapsed time per iteration (s): 2.29 | learning rate: 4.619E-05 | global batch size: 512 | lm loss: 2.004629E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.030 | TFLOPs: 23.06 | 63: iteration 18410/ 24424 | consumed samples: 9425920 | consumed tokens: 19304284160 | elapsed time per iteration (s): 2.29 | learning rate: 4.611E-05 | global batch size: 512 | lm loss: 2.013425E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.991 | TFLOPs: 23.06 | 63: iteration 18420/ 24424 | consumed samples: 9431040 | consumed tokens: 19314769920 | elapsed time per iteration (s): 2.27 | learning rate: 4.603E-05 | global batch size: 512 | lm loss: 2.005910E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.811 | TFLOPs: 23.25 | 63: iteration 18430/ 24424 | consumed samples: 9436160 | consumed tokens: 19325255680 | elapsed time per iteration (s): 4.23 | learning rate: 4.595E-05 | global batch size: 512 | lm loss: 2.022630E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 120.993 | TFLOPs: 12.46 | 63: iteration 18440/ 24424 | consumed samples: 9441280 | consumed tokens: 19335741440 | elapsed time per iteration (s): 2.25 | learning rate: 4.586E-05 | global batch size: 512 | lm loss: 1.981276E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.754 | TFLOPs: 23.45 | 63: iteration 18450/ 24424 | consumed samples: 9446400 | consumed tokens: 19346227200 | elapsed time per iteration (s): 2.25 | learning rate: 4.578E-05 | global batch size: 512 | lm loss: 2.013794E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.275 | TFLOPs: 23.40 | 63: iteration 18460/ 24424 | consumed samples: 9451520 | consumed tokens: 19356712960 | elapsed time per iteration (s): 2.26 | learning rate: 4.570E-05 | global batch size: 512 | lm loss: 2.019803E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.346 | TFLOPs: 23.30 | 63: iteration 18470/ 24424 | consumed samples: 9456640 | consumed tokens: 19367198720 | elapsed time per iteration (s): 2.23 | learning rate: 4.562E-05 | global batch size: 512 | lm loss: 2.031052E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.162 | TFLOPs: 23.59 | 63: iteration 18480/ 24424 | consumed samples: 9461760 | consumed tokens: 19377684480 | elapsed time per iteration (s): 2.27 | learning rate: 4.554E-05 | global batch size: 512 | lm loss: 1.999831E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.582 | TFLOPs: 23.22 | 63: iteration 18490/ 24424 | consumed samples: 9466880 | consumed tokens: 19388170240 | elapsed time per iteration (s): 2.29 | learning rate: 4.545E-05 | global batch size: 512 | lm loss: 2.023598E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.059 | TFLOPs: 23.07 | 63: iteration 18500/ 24424 | consumed samples: 9472000 | consumed tokens: 19398656000 | elapsed time per iteration (s): 2.26 | learning rate: 4.537E-05 | global batch size: 512 | lm loss: 1.997149E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.238 | TFLOPs: 23.29 | 63: iteration 18510/ 24424 | consumed samples: 9477120 | consumed tokens: 19409141760 | elapsed time per iteration (s): 2.29 | learning rate: 4.529E-05 | global batch size: 512 | lm loss: 2.004116E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.393 | TFLOPs: 23.00 | 63: iteration 18520/ 24424 | consumed samples: 9482240 | consumed tokens: 19419627520 | elapsed time per iteration (s): 3.76 | learning rate: 4.521E-05 | global batch size: 512 | lm loss: 2.011086E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 136.341 | TFLOPs: 14.04 | 63: iteration 18530/ 24424 | consumed samples: 9487360 | consumed tokens: 19430113280 | elapsed time per iteration (s): 2.24 | learning rate: 4.513E-05 | global batch size: 512 | lm loss: 2.010571E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.173 | TFLOPs: 23.49 | 63: iteration 18540/ 24424 | consumed samples: 9492480 | consumed tokens: 19440599040 | elapsed time per iteration (s): 2.25 | learning rate: 4.505E-05 | global batch size: 512 | lm loss: 2.002417E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.319 | TFLOPs: 23.40 | 63: iteration 18550/ 24424 | consumed samples: 9497600 | consumed tokens: 19451084800 | elapsed time per iteration (s): 2.55 | learning rate: 4.497E-05 | global batch size: 512 | lm loss: 2.010256E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 201.062 | TFLOPs: 20.70 | 63: iteration 18560/ 24424 | consumed samples: 9502720 | consumed tokens: 19461570560 | elapsed time per iteration (s): 2.26 | learning rate: 4.489E-05 | global batch size: 512 | lm loss: 2.005375E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.106 | TFLOPs: 23.28 | 63: iteration 18570/ 24424 | consumed samples: 9507840 | consumed tokens: 19472056320 | elapsed time per iteration (s): 2.27 | learning rate: 4.481E-05 | global batch size: 512 | lm loss: 2.017764E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.933 | TFLOPs: 23.26 | 63: iteration 18580/ 24424 | consumed samples: 9512960 | consumed tokens: 19482542080 | elapsed time per iteration (s): 2.37 | learning rate: 4.473E-05 | global batch size: 512 | lm loss: 2.014780E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 215.993 | TFLOPs: 22.24 | 63: iteration 18590/ 24424 | consumed samples: 9518080 | consumed tokens: 19493027840 | elapsed time per iteration (s): 2.24 | learning rate: 4.465E-05 | global batch size: 512 | lm loss: 1.997054E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.858 | TFLOPs: 23.56 | 63: iteration 18600/ 24424 | consumed samples: 9523200 | consumed tokens: 19503513600 | elapsed time per iteration (s): 2.31 | learning rate: 4.456E-05 | global batch size: 512 | lm loss: 1.998076E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.575 | TFLOPs: 22.81 | 63: iteration 18610/ 24424 | consumed samples: 9528320 | consumed tokens: 19513999360 | elapsed time per iteration (s): 2.23 | learning rate: 4.448E-05 | global batch size: 512 | lm loss: 2.008663E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.832 | TFLOPs: 23.66 | 63: iteration 18620/ 24424 | consumed samples: 9533440 | consumed tokens: 19524485120 | elapsed time per iteration (s): 2.28 | learning rate: 4.440E-05 | global batch size: 512 | lm loss: 1.999489E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.966 | TFLOPs: 23.16 | 63: iteration 18630/ 24424 | consumed samples: 9538560 | consumed tokens: 19534970880 | elapsed time per iteration (s): 2.27 | learning rate: 4.432E-05 | global batch size: 512 | lm loss: 2.003173E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.119 | TFLOPs: 23.17 | 63: iteration 18640/ 24424 | consumed samples: 9543680 | consumed tokens: 19545456640 | elapsed time per iteration (s): 2.23 | learning rate: 4.424E-05 | global batch size: 512 | lm loss: 2.021265E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.130 | TFLOPs: 23.59 | 63: iteration 18650/ 24424 | consumed samples: 9548800 | consumed tokens: 19555942400 | elapsed time per iteration (s): 2.24 | learning rate: 4.416E-05 | global batch size: 512 | lm loss: 2.004093E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.490 | TFLOPs: 23.52 | 63: iteration 18660/ 24424 | consumed samples: 9553920 | consumed tokens: 19566428160 | elapsed time per iteration (s): 2.26 | learning rate: 4.409E-05 | global batch size: 512 | lm loss: 2.006105E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.865 | TFLOPs: 23.35 | 63: iteration 18670/ 24424 | consumed samples: 9559040 | consumed tokens: 19576913920 | elapsed time per iteration (s): 2.28 | learning rate: 4.401E-05 | global batch size: 512 | lm loss: 1.999809E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.455 | TFLOPs: 23.11 | 63: iteration 18680/ 24424 | consumed samples: 9564160 | consumed tokens: 19587399680 | elapsed time per iteration (s): 2.23 | learning rate: 4.393E-05 | global batch size: 512 | lm loss: 2.013237E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.287 | TFLOPs: 23.60 | 63: iteration 18690/ 24424 | consumed samples: 9569280 | consumed tokens: 19597885440 | elapsed time per iteration (s): 2.26 | learning rate: 4.385E-05 | global batch size: 512 | lm loss: 2.018050E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.922 | TFLOPs: 23.36 | 63: iteration 18700/ 24424 | consumed samples: 9574400 | consumed tokens: 19608371200 | elapsed time per iteration (s): 2.39 | learning rate: 4.377E-05 | global batch size: 512 | lm loss: 1.994242E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 214.174 | TFLOPs: 22.05 | 63: iteration 18710/ 24424 | consumed samples: 9579520 | consumed tokens: 19618856960 | elapsed time per iteration (s): 2.68 | learning rate: 4.369E-05 | global batch size: 512 | lm loss: 1.979877E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.784 | TFLOPs: 19.64 | 63: iteration 18720/ 24424 | consumed samples: 9584640 | consumed tokens: 19629342720 | elapsed time per iteration (s): 2.24 | learning rate: 4.361E-05 | global batch size: 512 | lm loss: 1.993265E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.271 | TFLOPs: 23.50 | 63: iteration 18730/ 24424 | consumed samples: 9589760 | consumed tokens: 19639828480 | elapsed time per iteration (s): 3.92 | learning rate: 4.353E-05 | global batch size: 512 | lm loss: 2.012754E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 130.557 | TFLOPs: 13.44 | 63: iteration 18740/ 24424 | consumed samples: 9594880 | consumed tokens: 19650314240 | elapsed time per iteration (s): 2.25 | learning rate: 4.345E-05 | global batch size: 512 | lm loss: 2.014122E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.623 | TFLOPs: 23.43 | 63: iteration 18750/ 24424 | consumed samples: 9600000 | consumed tokens: 19660800000 | elapsed time per iteration (s): 2.23 | learning rate: 4.337E-05 | global batch size: 512 | lm loss: 2.014795E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.711 | TFLOPs: 23.65 | 63: iteration 18760/ 24424 | consumed samples: 9605120 | consumed tokens: 19671285760 | elapsed time per iteration (s): 2.26 | learning rate: 4.329E-05 | global batch size: 512 | lm loss: 1.995701E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.779 | TFLOPs: 23.35 | 63: iteration 18770/ 24424 | consumed samples: 9610240 | consumed tokens: 19681771520 | elapsed time per iteration (s): 2.23 | learning rate: 4.322E-05 | global batch size: 512 | lm loss: 1.973740E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.976 | TFLOPs: 23.67 | 63: iteration 18780/ 24424 | consumed samples: 9615360 | consumed tokens: 19692257280 | elapsed time per iteration (s): 2.24 | learning rate: 4.314E-05 | global batch size: 512 | lm loss: 1.979484E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.904 | TFLOPs: 23.56 | 63: iteration 18790/ 24424 | consumed samples: 9620480 | consumed tokens: 19702743040 | elapsed time per iteration (s): 2.24 | learning rate: 4.306E-05 | global batch size: 512 | lm loss: 2.015467E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.516 | TFLOPs: 23.52 | 63: iteration 18800/ 24424 | consumed samples: 9625600 | consumed tokens: 19713228800 | elapsed time per iteration (s): 2.23 | learning rate: 4.298E-05 | global batch size: 512 | lm loss: 2.005776E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.004 | TFLOPs: 23.68 | 63: iteration 18810/ 24424 | consumed samples: 9630720 | consumed tokens: 19723714560 | elapsed time per iteration (s): 2.24 | learning rate: 4.290E-05 | global batch size: 512 | lm loss: 2.013317E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.335 | TFLOPs: 23.51 | 63: iteration 18820/ 24424 | consumed samples: 9635840 | consumed tokens: 19734200320 | elapsed time per iteration (s): 2.23 | learning rate: 4.283E-05 | global batch size: 512 | lm loss: 1.991782E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.901 | TFLOPs: 23.67 | 63: iteration 18830/ 24424 | consumed samples: 9640960 | consumed tokens: 19744686080 | elapsed time per iteration (s): 2.28 | learning rate: 4.275E-05 | global batch size: 512 | lm loss: 1.994040E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.088 | TFLOPs: 23.07 | 63: iteration 18840/ 24424 | consumed samples: 9646080 | consumed tokens: 19755171840 | elapsed time per iteration (s): 2.26 | learning rate: 4.267E-05 | global batch size: 512 | lm loss: 1.992691E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.393 | TFLOPs: 23.31 | 63: iteration 18850/ 24424 | consumed samples: 9651200 | consumed tokens: 19765657600 | elapsed time per iteration (s): 2.25 | learning rate: 4.259E-05 | global batch size: 512 | lm loss: 2.006405E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.259 | TFLOPs: 23.40 | 63: iteration 18860/ 24424 | consumed samples: 9656320 | consumed tokens: 19776143360 | elapsed time per iteration (s): 2.36 | learning rate: 4.252E-05 | global batch size: 512 | lm loss: 2.033025E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 216.744 | TFLOPs: 22.31 | 63: iteration 18870/ 24424 | consumed samples: 9661440 | consumed tokens: 19786629120 | elapsed time per iteration (s): 2.29 | learning rate: 4.244E-05 | global batch size: 512 | lm loss: 2.008755E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.911 | TFLOPs: 23.05 | 63: iteration 18880/ 24424 | consumed samples: 9666560 | consumed tokens: 19797114880 | elapsed time per iteration (s): 2.26 | learning rate: 4.236E-05 | global batch size: 512 | lm loss: 2.000047E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.697 | TFLOPs: 23.34 | 63: iteration 18890/ 24424 | consumed samples: 9671680 | consumed tokens: 19807600640 | elapsed time per iteration (s): 2.23 | learning rate: 4.228E-05 | global batch size: 512 | lm loss: 2.000420E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.257 | TFLOPs: 23.60 | 63: iteration 18900/ 24424 | consumed samples: 9676800 | consumed tokens: 19818086400 | elapsed time per iteration (s): 2.24 | learning rate: 4.221E-05 | global batch size: 512 | lm loss: 1.991277E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.730 | TFLOPs: 23.55 | 63: iteration 18910/ 24424 | consumed samples: 9681920 | consumed tokens: 19828572160 | elapsed time per iteration (s): 2.24 | learning rate: 4.213E-05 | global batch size: 512 | lm loss: 1.979496E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.479 | TFLOPs: 23.52 | 63: iteration 18920/ 24424 | consumed samples: 9687040 | consumed tokens: 19839057920 | elapsed time per iteration (s): 2.25 | learning rate: 4.205E-05 | global batch size: 512 | lm loss: 2.003296E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.429 | TFLOPs: 23.41 | 63: iteration 18930/ 24424 | consumed samples: 9692160 | consumed tokens: 19849543680 | elapsed time per iteration (s): 2.23 | learning rate: 4.198E-05 | global batch size: 512 | lm loss: 2.006570E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.206 | TFLOPs: 23.60 | 63: iteration 18940/ 24424 | consumed samples: 9697280 | consumed tokens: 19860029440 | elapsed time per iteration (s): 2.24 | learning rate: 4.190E-05 | global batch size: 512 | lm loss: 2.006809E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.909 | TFLOPs: 23.57 | 63: iteration 18950/ 24424 | consumed samples: 9702400 | consumed tokens: 19870515200 | elapsed time per iteration (s): 2.27 | learning rate: 4.182E-05 | global batch size: 512 | lm loss: 1.998828E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.501 | TFLOPs: 23.21 | 63: iteration 18960/ 24424 | consumed samples: 9707520 | consumed tokens: 19881000960 | elapsed time per iteration (s): 2.26 | learning rate: 4.175E-05 | global batch size: 512 | lm loss: 2.010436E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.415 | TFLOPs: 23.31 | 63: iteration 18970/ 24424 | consumed samples: 9712640 | consumed tokens: 19891486720 | elapsed time per iteration (s): 2.25 | learning rate: 4.167E-05 | global batch size: 512 | lm loss: 1.994841E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.270 | TFLOPs: 23.40 | 63: iteration 18980/ 24424 | consumed samples: 9717760 | consumed tokens: 19901972480 | elapsed time per iteration (s): 2.23 | learning rate: 4.160E-05 | global batch size: 512 | lm loss: 2.015051E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.928 | TFLOPs: 23.67 | 63: iteration 18990/ 24424 | consumed samples: 9722880 | consumed tokens: 19912458240 | elapsed time per iteration (s): 2.23 | learning rate: 4.152E-05 | global batch size: 512 | lm loss: 1.996339E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.543 | TFLOPs: 23.63 | 63: iteration 19000/ 24424 | consumed samples: 9728000 | consumed tokens: 19922944000 | elapsed time per iteration (s): 2.31 | learning rate: 4.144E-05 | global batch size: 512 | lm loss: 2.005632E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.101 | TFLOPs: 22.86 | 63: ------------------------------------------------------------------------------------------- 63: valid loss at iteration 19000 | lm loss value: 2.014602E+00 | lm loss PPL: 7.497744E+00 | 63: ------------------------------------------------------------------------------------------- 0: saving checkpoint at iteration 19000 to checkpoints_3b9 0: [2022-11-26 06:01:54,195] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step19000 is begin to save! 0: [2022-11-26 06:01:54,216] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_01-model_00-model_states.pt... 32: [2022-11-26 06:01:54,216] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_21-model_00-model_states.pt... 32: [2022-11-26 06:01:54,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_21-model_00-model_states.pt. 32: [2022-11-26 06:01:54,486] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_22-model_00-model_states.pt... 0: [2022-11-26 06:01:54,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_01-model_00-model_states.pt. 0: [2022-11-26 06:01:54,578] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_03-model_00-model_states.pt... 32: [2022-11-26 06:01:54,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_22-model_00-model_states.pt. 32: [2022-11-26 06:01:54,727] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_23-model_00-model_states.pt... 0: [2022-11-26 06:01:54,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_03-model_00-model_states.pt. 0: [2022-11-26 06:01:54,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_04-model_00-model_states.pt... 32: [2022-11-26 06:01:54,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_23-model_00-model_states.pt. 32: [2022-11-26 06:01:54,960] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_24-model_00-model_states.pt... 0: [2022-11-26 06:01:55,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_04-model_00-model_states.pt. 0: [2022-11-26 06:01:55,045] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_05-model_00-model_states.pt... 32: [2022-11-26 06:01:55,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_24-model_00-model_states.pt. 32: [2022-11-26 06:01:55,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_25-model_00-model_states.pt... 0: [2022-11-26 06:01:55,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_05-model_00-model_states.pt. 0: [2022-11-26 06:01:55,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_06-model_00-model_states.pt... 32: [2022-11-26 06:01:55,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_25-model_00-model_states.pt. 32: [2022-11-26 06:01:55,428] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_26-model_00-model_states.pt... 0: [2022-11-26 06:01:55,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_06-model_00-model_states.pt. 0: [2022-11-26 06:01:55,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_07-model_00-model_states.pt... 32: [2022-11-26 06:01:55,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_26-model_00-model_states.pt. 32: [2022-11-26 06:01:55,661] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_27-model_00-model_states.pt... 0: [2022-11-26 06:01:55,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_07-model_00-model_states.pt. 0: [2022-11-26 06:01:55,746] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_08-model_00-model_states.pt... 32: [2022-11-26 06:01:55,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_27-model_00-model_states.pt. 32: [2022-11-26 06:01:55,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_28-model_00-model_states.pt... 0: [2022-11-26 06:01:55,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_08-model_00-model_states.pt. 0: [2022-11-26 06:01:55,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_09-model_00-model_states.pt... 32: [2022-11-26 06:01:56,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_28-model_00-model_states.pt. 32: [2022-11-26 06:01:56,126] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_29-model_00-model_states.pt... 0: [2022-11-26 06:01:56,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_09-model_00-model_states.pt. 0: [2022-11-26 06:01:56,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_10-model_00-model_states.pt... 32: [2022-11-26 06:01:56,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_29-model_00-model_states.pt. 32: [2022-11-26 06:01:56,357] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_30-model_00-model_states.pt... 0: [2022-11-26 06:01:56,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_10-model_00-model_states.pt. 0: [2022-11-26 06:01:56,428] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_11-model_00-model_states.pt... 32: [2022-11-26 06:01:56,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_30-model_00-model_states.pt. 32: [2022-11-26 06:01:56,586] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_31-model_00-model_states.pt... 0: [2022-11-26 06:01:56,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_11-model_00-model_states.pt. 0: [2022-11-26 06:01:56,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_12-model_00-model_states.pt... 32: [2022-11-26 06:01:56,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_31-model_00-model_states.pt. 32: [2022-11-26 06:01:56,815] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_32-model_00-model_states.pt... 0: [2022-11-26 06:01:56,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_12-model_00-model_states.pt. 0: [2022-11-26 06:01:56,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_13-model_00-model_states.pt... 32: [2022-11-26 06:01:57,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_32-model_00-model_states.pt. 32: [2022-11-26 06:01:57,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_33-model_00-model_states.pt... 0: [2022-11-26 06:01:57,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_13-model_00-model_states.pt. 0: [2022-11-26 06:01:57,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_14-model_00-model_states.pt... 32: [2022-11-26 06:01:57,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_33-model_00-model_states.pt. 32: [2022-11-26 06:01:57,271] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_34-model_00-model_states.pt... 0: [2022-11-26 06:01:57,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_14-model_00-model_states.pt. 0: [2022-11-26 06:01:57,302] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_15-model_00-model_states.pt... 32: [2022-11-26 06:01:57,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_34-model_00-model_states.pt. 32: [2022-11-26 06:01:57,498] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_35-model_00-model_states.pt... 0: [2022-11-26 06:01:57,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_15-model_00-model_states.pt. 0: [2022-11-26 06:01:57,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_16-model_00-model_states.pt... 32: [2022-11-26 06:01:57,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_35-model_00-model_states.pt. 32: [2022-11-26 06:01:57,729] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_36-model_00-model_states.pt... 0: [2022-11-26 06:01:57,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_16-model_00-model_states.pt. 0: [2022-11-26 06:01:57,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_17-model_00-model_states.pt... 0: [2022-11-26 06:01:57,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_17-model_00-model_states.pt. 0: [2022-11-26 06:01:57,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_18-model_00-model_states.pt... 32: [2022-11-26 06:01:57,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_36-model_00-model_states.pt. 32: [2022-11-26 06:01:57,958] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_37-model_00-model_states.pt... 0: [2022-11-26 06:01:58,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_18-model_00-model_states.pt. 0: [2022-11-26 06:01:58,166] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_19-model_00-model_states.pt... 32: [2022-11-26 06:01:58,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_37-model_00-model_states.pt. 32: [2022-11-26 06:01:58,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_38-model_00-model_states.pt... 0: [2022-11-26 06:01:58,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_19-model_00-model_states.pt. 0: [2022-11-26 06:01:58,386] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_20-model_00-model_states.pt... 32: [2022-11-26 06:01:58,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_38-model_00-model_states.pt. 32: [2022-11-26 06:01:58,408] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/layer_40-model_00-model_states.pt... 32: [2022-11-26 06:01:58,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_40-model_00-model_states.pt. 32: [2022-11-26 06:01:58,410] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/mp_rank_01_model_states.pt... 32: [2022-11-26 06:01:58,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/mp_rank_01_model_states.pt. 0: [2022-11-26 06:01:58,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/layer_20-model_00-model_states.pt. 0: [2022-11-26 06:01:58,598] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step19000/mp_rank_00_model_states.pt 0: [2022-11-26 06:01:58,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/mp_rank_00_model_states.pt... 0: [2022-11-26 06:01:58,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/mp_rank_00_model_states.pt. 51: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 55: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 55: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 53: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 53: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 57: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 57: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 57: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 61: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 59: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 63: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 63: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 52: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 52: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 54: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 56: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 56: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 58: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 58: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 62: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 62: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 60: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 60: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 33: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 33: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 35: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 35: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 34: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 34: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 40: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 44: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 44: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 44: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 32: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 42: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 42: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 46: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 46: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 46: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 46: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 36: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 36: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 36: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 48: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 48: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 41: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 41: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 37: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 37: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 47: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 47: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 47: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 47: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 49: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 49: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 43: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 43: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 45: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 45: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 45: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 45: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 45: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 39: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 39: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 39: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 50: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 50: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 38: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 38: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 51: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 51: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 51: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 55: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 53: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 53: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 57: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 61: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 61: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 61: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 61: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 59: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 63: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 63: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 19: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 19: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 19: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 21: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 15: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 15: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 52: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 54: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 54: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 56: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 56: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 58: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 62: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 62: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 16: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 16: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 16: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 16: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 60: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 60: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 2: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 4: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 4: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 4: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 14: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 14: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 18: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 18: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 10: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 10: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 10: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 26: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 11: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 17: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 17: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 17: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 9: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 9: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 23: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 23: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 23: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 23: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 25: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 25: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 25: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 27: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 29: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 29: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 29: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 31: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 31: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 31: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 31: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 31: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 5: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 5: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 5: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 33: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 33: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 1: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 7: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 3: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 3: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 3: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 3: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 35: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 22: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 12: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 30: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 24: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 28: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 20: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 20: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 34: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 40: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 40: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 40: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 44: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 44: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 6: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 6: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 6: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 6: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 32: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 42: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 42: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 46: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 46: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 36: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 48: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 48: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 41: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 37: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 37: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 47: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 49: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 49: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 43: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 43: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 45: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 39: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 39: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 50: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 50: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 38: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 55: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 59: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 19: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 19: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 19: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 19: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 19: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 21: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 15: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 54: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 16: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 16: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 2: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 4: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 4: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 4: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 14: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 14: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 14: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 14: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 18: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 18: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 18: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 18: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 10: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 10: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 8: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 8: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 8: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 26: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 26: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 26: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 11: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 11: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 17: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 17: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 17: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 17: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 9: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 13: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 23: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 25: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 27: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 29: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 29: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 31: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 31: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 5: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 5: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 1: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 7: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 3: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 22: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 12: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 30: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 30: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 24: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 24: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 28: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 20: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 20: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 20: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 40: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 44: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 6: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 32: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 32: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 48: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 37: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 47: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 45: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 38: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 21: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 15: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 54: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 16: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 2: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 4: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 14: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 18: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 10: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 8: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 26: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 11: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 11: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 11: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 9: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 9: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 13: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 23: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 25: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 27: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 29: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 31: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 5: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 1: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 7: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 3: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 22: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 12: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 12: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 30: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 30: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 24: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 28: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 20: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 44: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 6: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 32: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 48: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 45: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 21: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 54: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 16: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 2: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 4: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 14: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 18: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 10: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 8: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 26: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 11: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 9: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 13: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 23: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 25: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 27: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 27: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 27: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 27: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 29: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 5: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 1: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 7: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 3: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 22: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 12: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 30: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 30: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 24: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 28: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 6: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 32: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 21: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 21: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 15: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 15: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 54: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 2: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 2: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 10: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 8: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 9: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 13: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 23: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 27: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 29: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 1: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 7: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 3: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 22: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 22: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 22: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 12: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 30: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 24: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 28: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 20: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 6: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 32: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 21: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 15: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 54: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 2: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 8: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 9: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 13: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 1: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 7: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 22: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 12: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 24: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 28: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 32: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 21: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 2: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 13: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 1: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 7: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 12: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 24: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 28: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 13: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 1: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 28: [2022-11-26 06:01:58,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step19000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:01:58,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 14: [2022-11-26 06:01:58,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-26 06:01:58,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-26 06:01:58,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 18: [2022-11-26 06:01:58,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-26 06:01:58,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-26 06:01:58,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 4: [2022-11-26 06:01:58,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 4: [2022-11-26 06:01:58,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-26 06:01:58,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 32: [2022-11-26 06:01:58,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-26 06:01:58,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-26 06:01:58,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 58: [2022-11-26 06:01:58,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 58: [2022-11-26 06:01:58,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 46: [2022-11-26 06:01:58,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 58: [2022-11-26 06:01:58,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 43: [2022-11-26 06:01:58,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-26 06:01:58,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 35: [2022-11-26 06:01:58,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 43: [2022-11-26 06:01:58,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 35: [2022-11-26 06:01:58,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 46: [2022-11-26 06:01:58,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 35: [2022-11-26 06:01:58,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 46: [2022-11-26 06:01:58,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 0: [2022-11-26 06:01:58,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 25: [2022-11-26 06:01:58,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 0: [2022-11-26 06:01:58,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 25: [2022-11-26 06:01:58,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 0: [2022-11-26 06:01:58,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 25: [2022-11-26 06:01:58,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 27: [2022-11-26 06:01:58,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 27: [2022-11-26 06:01:58,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-26 06:01:58,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 19: [2022-11-26 06:01:58,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-26 06:01:58,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-26 06:01:58,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 30: [2022-11-26 06:01:58,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-26 06:01:58,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-26 06:01:58,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 28: [2022-11-26 06:01:58,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-26 06:01:58,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-26 06:01:58,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 20: [2022-11-26 06:01:58,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 55: [2022-11-26 06:01:58,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 20: [2022-11-26 06:01:58,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 55: [2022-11-26 06:01:58,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 20: [2022-11-26 06:01:58,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 55: [2022-11-26 06:01:58,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 37: [2022-11-26 06:01:58,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-26 06:01:58,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-26 06:01:58,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 53: [2022-11-26 06:01:58,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-26 06:01:58,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-26 06:01:58,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 21: [2022-11-26 06:01:58,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 51: [2022-11-26 06:01:58,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 21: [2022-11-26 06:01:58,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 51: [2022-11-26 06:01:58,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 21: [2022-11-26 06:01:58,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 51: [2022-11-26 06:01:58,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 7: [2022-11-26 06:01:58,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-26 06:01:58,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 7: [2022-11-26 06:01:58,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 45: [2022-11-26 06:01:58,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 6: [2022-11-26 06:01:58,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 45: [2022-11-26 06:01:58,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 6: [2022-11-26 06:01:58,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 45: [2022-11-26 06:01:58,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 6: [2022-11-26 06:01:58,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 24: [2022-11-26 06:01:58,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-26 06:01:58,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-26 06:01:58,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 33: [2022-11-26 06:01:58,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-26 06:01:58,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-26 06:01:58,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 9: [2022-11-26 06:01:58,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 9: [2022-11-26 06:01:58,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-26 06:01:58,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 14: [2022-11-26 06:01:58,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 62: [2022-11-26 06:01:58,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-26 06:01:58,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 14: [2022-11-26 06:01:58,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 62: [2022-11-26 06:01:58,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 14: [2022-11-26 06:01:58,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 41: [2022-11-26 06:01:58,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 41: [2022-11-26 06:01:58,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-26 06:01:58,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 60: [2022-11-26 06:01:58,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:01:58,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-26 06:01:58,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 38: [2022-11-26 06:01:58,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-26 06:01:58,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-26 06:01:58,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 42: [2022-11-26 06:01:58,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 25: [2022-11-26 06:01:58,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 42: [2022-11-26 06:01:58,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 25: [2022-11-26 06:01:58,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 42: [2022-11-26 06:01:58,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 25: [2022-11-26 06:01:58,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 61: [2022-11-26 06:01:58,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:01:58,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-26 06:01:58,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 29: [2022-11-26 06:01:58,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-26 06:01:58,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-26 06:01:58,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 34: [2022-11-26 06:01:58,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 34: [2022-11-26 06:01:58,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-26 06:01:58,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 34: [2022-11-26 06:01:58,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-26 06:01:58,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-26 06:01:58,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 41: [2022-11-26 06:01:58,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-26 06:01:58,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-26 06:01:58,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 39: [2022-11-26 06:01:58,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:01:58,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 32: [2022-11-26 06:01:58,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:01:58,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 32: [2022-11-26 06:01:58,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 15: [2022-11-26 06:01:58,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 32: [2022-11-26 06:01:58,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 15: [2022-11-26 06:01:58,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-26 06:01:58,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 15: [2022-11-26 06:01:58,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-26 06:01:58,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-26 06:01:58,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 26: [2022-11-26 06:01:58,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-26 06:01:58,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-26 06:01:58,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 0: [2022-11-26 06:01:58,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-26 06:01:58,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-26 06:01:58,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 28: [2022-11-26 06:01:58,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-26 06:01:58,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-26 06:01:58,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 16: [2022-11-26 06:01:58,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-26 06:01:58,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 16: [2022-11-26 06:01:58,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-26 06:01:58,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-26 06:01:58,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 16: [2022-11-26 06:01:58,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 18: [2022-11-26 06:01:58,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-26 06:01:58,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-26 06:01:58,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 1: [2022-11-26 06:01:58,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-26 06:01:58,884] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-26 06:01:58,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 30: [2022-11-26 06:01:58,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-26 06:01:58,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-26 06:01:58,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 23: [2022-11-26 06:01:58,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 23: [2022-11-26 06:01:58,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 23: [2022-11-26 06:01:58,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 51: [2022-11-26 06:01:58,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-26 06:01:58,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-26 06:01:58,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 9: [2022-11-26 06:01:58,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 21: [2022-11-26 06:01:58,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-26 06:01:58,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 21: [2022-11-26 06:01:58,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 55: [2022-11-26 06:01:58,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-26 06:01:58,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-26 06:01:58,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 60: [2022-11-26 06:01:58,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:01:58,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 61: [2022-11-26 06:01:58,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:01:58,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 61: [2022-11-26 06:01:58,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-26 06:01:58,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 35: [2022-11-26 06:01:58,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 17: [2022-11-26 06:01:58,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 17: [2022-11-26 06:01:58,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 35: [2022-11-26 06:01:58,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-26 06:01:58,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 17: [2022-11-26 06:01:58,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 17: [2022-11-26 06:01:58,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-26 06:01:58,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 17: [2022-11-26 06:01:58,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 58: [2022-11-26 06:01:58,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-26 06:01:58,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-26 06:01:58,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 39: [2022-11-26 06:01:58,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:01:58,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-26 06:01:58,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 45: [2022-11-26 06:01:58,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 7: [2022-11-26 06:01:58,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-26 06:01:58,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 45: [2022-11-26 06:01:58,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-26 06:01:58,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 7: [2022-11-26 06:01:58,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 20: [2022-11-26 06:01:58,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 19: [2022-11-26 06:01:58,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 20: [2022-11-26 06:01:58,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 42: [2022-11-26 06:01:58,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 19: [2022-11-26 06:01:58,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 20: [2022-11-26 06:01:58,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 19: [2022-11-26 06:01:58,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 42: [2022-11-26 06:01:58,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-26 06:01:58,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 53: [2022-11-26 06:01:58,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-26 06:01:58,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-26 06:01:58,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 14: [2022-11-26 06:01:58,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 14: [2022-11-26 06:01:58,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-26 06:01:58,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 22: [2022-11-26 06:01:58,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-26 06:01:58,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-26 06:01:58,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-26 06:01:58,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 22: [2022-11-26 06:01:58,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-26 06:01:58,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 9: [2022-11-26 06:01:58,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-26 06:01:58,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 24: [2022-11-26 06:01:58,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-26 06:01:58,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-26 06:01:58,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 37: [2022-11-26 06:01:58,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 6: [2022-11-26 06:01:58,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 6: [2022-11-26 06:01:58,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 37: [2022-11-26 06:01:58,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 6: [2022-11-26 06:01:58,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 37: [2022-11-26 06:01:58,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 3: [2022-11-26 06:01:58,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-26 06:01:58,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 3: [2022-11-26 06:01:58,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 49: [2022-11-26 06:01:58,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-26 06:01:58,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-26 06:01:58,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 38: [2022-11-26 06:01:58,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-26 06:01:58,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 29: [2022-11-26 06:01:58,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-26 06:01:58,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 38: [2022-11-26 06:01:58,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 29: [2022-11-26 06:01:58,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 9: [2022-11-26 06:01:58,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 9: [2022-11-26 06:01:58,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-26 06:01:58,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 23: [2022-11-26 06:01:58,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-26 06:01:58,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-26 06:01:58,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 33: [2022-11-26 06:01:58,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-26 06:01:58,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-26 06:01:58,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 37: [2022-11-26 06:01:58,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-26 06:01:58,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 7: [2022-11-26 06:01:58,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-26 06:01:58,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 37: [2022-11-26 06:01:58,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 7: [2022-11-26 06:01:58,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 3: [2022-11-26 06:01:58,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-26 06:01:58,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 51: [2022-11-26 06:01:58,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-26 06:01:58,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-26 06:01:58,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 3: [2022-11-26 06:01:58,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 19: [2022-11-26 06:01:58,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-26 06:01:58,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-26 06:01:58,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 59: [2022-11-26 06:01:58,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-26 06:01:58,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-26 06:01:58,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 53: [2022-11-26 06:01:58,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-26 06:01:58,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-26 06:01:58,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 16: [2022-11-26 06:01:58,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-26 06:01:58,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-26 06:01:58,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 2: [2022-11-26 06:01:58,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 2: [2022-11-26 06:01:58,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-26 06:01:58,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-26 06:01:58,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 28: [2022-11-26 06:01:58,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 20: [2022-11-26 06:01:58,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 41: [2022-11-26 06:01:58,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 2: [2022-11-26 06:01:58,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 28: [2022-11-26 06:01:58,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 20: [2022-11-26 06:01:58,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 41: [2022-11-26 06:01:58,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 2: [2022-11-26 06:01:58,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 28: [2022-11-26 06:01:58,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 20: [2022-11-26 06:01:58,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 41: [2022-11-26 06:01:58,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 2: [2022-11-26 06:01:58,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 2: [2022-11-26 06:01:58,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-26 06:01:58,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 44: [2022-11-26 06:01:58,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-26 06:01:58,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 44: [2022-11-26 06:01:58,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-26 06:01:58,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-26 06:01:58,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 44: [2022-11-26 06:01:58,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 44: [2022-11-26 06:01:58,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-26 06:01:58,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 44: [2022-11-26 06:01:58,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 59: [2022-11-26 06:01:58,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 59: [2022-11-26 06:01:58,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-26 06:01:58,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 21: [2022-11-26 06:01:58,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-26 06:01:58,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-26 06:01:58,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 50: [2022-11-26 06:01:58,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 50: [2022-11-26 06:01:58,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 50: [2022-11-26 06:01:58,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-26 06:01:58,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-26 06:01:58,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 50: [2022-11-26 06:01:58,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 25: [2022-11-26 06:01:58,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 0: [2022-11-26 06:01:58,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 25: [2022-11-26 06:01:58,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 43: [2022-11-26 06:01:58,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 0: [2022-11-26 06:01:58,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 25: [2022-11-26 06:01:58,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 43: [2022-11-26 06:01:58,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 29: [2022-11-26 06:01:58,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 43: [2022-11-26 06:01:58,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 0: [2022-11-26 06:01:58,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 29: [2022-11-26 06:01:58,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-26 06:01:58,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 60: [2022-11-26 06:01:58,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:01:58,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-26 06:01:58,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 26: [2022-11-26 06:01:58,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-26 06:01:58,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-26 06:01:58,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 27: [2022-11-26 06:01:58,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-26 06:01:58,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 5: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 5: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 27: [2022-11-26 06:01:58,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 5: [2022-11-26 06:01:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-26 06:01:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-26 06:01:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 5: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 5: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 5: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 61: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:01:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 1: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-26 06:01:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 32: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 32: [2022-11-26 06:01:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 31: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-26 06:01:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-26 06:01:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-26 06:01:58,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 31: [2022-11-26 06:01:58,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 31: [2022-11-26 06:01:58,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 34: [2022-11-26 06:01:58,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 34: [2022-11-26 06:01:58,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-26 06:01:58,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 40: [2022-11-26 06:01:58,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-26 06:01:58,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 23: [2022-11-26 06:01:58,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 40: [2022-11-26 06:01:58,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-26 06:01:58,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 55: [2022-11-26 06:01:58,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 59: [2022-11-26 06:01:58,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 23: [2022-11-26 06:01:58,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 40: [2022-11-26 06:01:58,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 40: [2022-11-26 06:01:58,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 55: [2022-11-26 06:01:58,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 59: [2022-11-26 06:01:58,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 23: [2022-11-26 06:01:58,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 55: [2022-11-26 06:01:58,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 59: [2022-11-26 06:01:58,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 49: [2022-11-26 06:01:58,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-26 06:01:58,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 49: [2022-11-26 06:01:58,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 33: [2022-11-26 06:01:58,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-26 06:01:58,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 14: [2022-11-26 06:01:58,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 33: [2022-11-26 06:01:58,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 14: [2022-11-26 06:01:58,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-26 06:01:58,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 10: [2022-11-26 06:01:58,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 10: [2022-11-26 06:01:58,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-26 06:01:58,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 10: [2022-11-26 06:01:58,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 58: [2022-11-26 06:01:58,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 10: [2022-11-26 06:01:58,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-26 06:01:58,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 10: [2022-11-26 06:01:58,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-26 06:01:58,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 58: [2022-11-26 06:01:58,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 10: [2022-11-26 06:01:58,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 58: [2022-11-26 06:01:58,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 15: [2022-11-26 06:01:58,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-26 06:01:58,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 15: [2022-11-26 06:01:58,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 13: [2022-11-26 06:01:58,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-26 06:01:58,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 18: [2022-11-26 06:01:58,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 13: [2022-11-26 06:01:58,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-26 06:01:58,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 18: [2022-11-26 06:01:58,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 13: [2022-11-26 06:01:58,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 13: [2022-11-26 06:01:58,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 18: [2022-11-26 06:01:58,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 42: [2022-11-26 06:01:58,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 45: [2022-11-26 06:01:58,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 42: [2022-11-26 06:01:58,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 45: [2022-11-26 06:01:58,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-26 06:01:58,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 42: [2022-11-26 06:01:58,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 63: [2022-11-26 06:01:58,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:01:58,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:01:58,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:01:58,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-26 06:01:58,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-26 06:01:58,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-26 06:01:58,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 63: [2022-11-26 06:01:58,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 63: [2022-11-26 06:01:58,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 1: [2022-11-26 06:01:58,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 1: [2022-11-26 06:01:58,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-26 06:01:58,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 56: [2022-11-26 06:01:58,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-26 06:01:58,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 35: [2022-11-26 06:01:58,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-26 06:01:58,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 56: [2022-11-26 06:01:58,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 56: [2022-11-26 06:01:58,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 35: [2022-11-26 06:01:58,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 56: [2022-11-26 06:01:58,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-26 06:01:58,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 56: [2022-11-26 06:01:58,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 56: [2022-11-26 06:01:58,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 56: [2022-11-26 06:01:58,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 12: [2022-11-26 06:01:58,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-26 06:01:58,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-26 06:01:58,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-26 06:01:58,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-26 06:01:58,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 12: [2022-11-26 06:01:58,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 40: [2022-11-26 06:01:58,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 0: [2022-11-26 06:01:58,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 40: [2022-11-26 06:01:58,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-26 06:01:58,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 0: [2022-11-26 06:01:58,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 3: [2022-11-26 06:01:58,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-26 06:01:58,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-26 06:01:58,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 57: [2022-11-26 06:01:58,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 57: [2022-11-26 06:01:58,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 57: [2022-11-26 06:01:58,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-26 06:01:58,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-26 06:01:58,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 57: [2022-11-26 06:01:58,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-26 06:01:58,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 57: [2022-11-26 06:01:58,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 57: [2022-11-26 06:01:58,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 24: [2022-11-26 06:01:58,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-26 06:01:58,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-26 06:01:58,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 23: [2022-11-26 06:01:58,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-26 06:01:58,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 38: [2022-11-26 06:01:58,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 23: [2022-11-26 06:01:58,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 38: [2022-11-26 06:01:58,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-26 06:01:58,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 43: [2022-11-26 06:01:58,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-26 06:01:58,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-26 06:01:58,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 8: [2022-11-26 06:01:58,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:01:58,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-26 06:01:58,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 8: [2022-11-26 06:01:58,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:01:58,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:01:58,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-26 06:01:58,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-26 06:01:58,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 8: [2022-11-26 06:01:58,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 30: [2022-11-26 06:01:58,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 12: [2022-11-26 06:01:58,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 12: [2022-11-26 06:01:58,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 30: [2022-11-26 06:01:58,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 12: [2022-11-26 06:01:58,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 6: [2022-11-26 06:01:58,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 30: [2022-11-26 06:01:58,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 6: [2022-11-26 06:01:58,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 6: [2022-11-26 06:01:58,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 10: [2022-11-26 06:01:58,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-26 06:01:58,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-26 06:01:58,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 27: [2022-11-26 06:01:58,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-26 06:01:58,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-26 06:01:58,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 26: [2022-11-26 06:01:58,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 26: [2022-11-26 06:01:58,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 26: [2022-11-26 06:01:58,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 15: [2022-11-26 06:01:58,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-26 06:01:58,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-26 06:01:58,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 39: [2022-11-26 06:01:58,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:01:58,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-26 06:01:58,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 2: [2022-11-26 06:01:58,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-26 06:01:58,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-26 06:01:58,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 26: [2022-11-26 06:01:58,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 26: [2022-11-26 06:01:58,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-26 06:01:58,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 47: [2022-11-26 06:01:58,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-26 06:01:58,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-26 06:01:58,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 27: [2022-11-26 06:01:58,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 47: [2022-11-26 06:01:58,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-26 06:01:58,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 27: [2022-11-26 06:01:58,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 47: [2022-11-26 06:01:58,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 47: [2022-11-26 06:01:58,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 27: [2022-11-26 06:01:58,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 47: [2022-11-26 06:01:58,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 1: [2022-11-26 06:01:58,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 47: [2022-11-26 06:01:58,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 1: [2022-11-26 06:01:58,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-26 06:01:58,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 18: [2022-11-26 06:01:58,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-26 06:01:58,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-26 06:01:58,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 48: [2022-11-26 06:01:58,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-26 06:01:58,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-26 06:01:58,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-26 06:01:58,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-26 06:01:58,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 48: [2022-11-26 06:01:58,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 25: [2022-11-26 06:01:58,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 25: [2022-11-26 06:01:58,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 11: [2022-11-26 06:01:58,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 11: [2022-11-26 06:01:58,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-26 06:01:58,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 25: [2022-11-26 06:01:58,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 11: [2022-11-26 06:01:58,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-26 06:01:58,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-26 06:01:58,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 11: [2022-11-26 06:01:58,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 11: [2022-11-26 06:01:58,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-26 06:01:58,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 11: [2022-11-26 06:01:58,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 11: [2022-11-26 06:01:58,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 11: [2022-11-26 06:01:58,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 19: [2022-11-26 06:01:58,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-26 06:01:58,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-26 06:01:58,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 54: [2022-11-26 06:01:58,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 54: [2022-11-26 06:01:58,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 54: [2022-11-26 06:01:58,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 54: [2022-11-26 06:01:58,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-26 06:01:58,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-26 06:01:58,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 54: [2022-11-26 06:01:58,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 54: [2022-11-26 06:01:58,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-26 06:01:58,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 24: [2022-11-26 06:01:58,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-26 06:01:58,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-26 06:01:58,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 52: [2022-11-26 06:01:58,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-26 06:01:58,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 52: [2022-11-26 06:01:58,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-26 06:01:58,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-26 06:01:58,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-26 06:01:58,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-26 06:01:58,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 52: [2022-11-26 06:01:58,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 52: [2022-11-26 06:01:58,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 9: [2022-11-26 06:01:58,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-26 06:01:58,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-26 06:01:58,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 45: [2022-11-26 06:01:58,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-26 06:01:58,925] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-26 06:01:58,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 13: [2022-11-26 06:01:58,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 13: [2022-11-26 06:01:58,925] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-26 06:01:58,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 17: [2022-11-26 06:01:58,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-26 06:01:58,925] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-26 06:01:58,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 58: [2022-11-26 06:01:58,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-26 06:01:58,926] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-26 06:01:58,926] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 4: [2022-11-26 06:01:58,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 4: [2022-11-26 06:01:58,927] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-26 06:01:58,927] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 41: [2022-11-26 06:01:58,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-26 06:01:58,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 41: [2022-11-26 06:01:58,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 43: [2022-11-26 06:01:58,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-26 06:01:58,930] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-26 06:01:58,930] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 48: [2022-11-26 06:01:58,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-26 06:01:58,930] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-26 06:01:58,930] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 62: [2022-11-26 06:01:58,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-26 06:01:58,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-26 06:01:58,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 61: [2022-11-26 06:01:58,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:01:58,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-26 06:01:58,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 50: [2022-11-26 06:01:58,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-26 06:01:58,937] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-26 06:01:58,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 29: [2022-11-26 06:01:58,939] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-26 06:01:58,939] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-26 06:01:58,939] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 22: [2022-11-26 06:01:58,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-26 06:01:58,941] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-26 06:01:58,941] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 30: [2022-11-26 06:01:58,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 62: [2022-11-26 06:01:58,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 30: [2022-11-26 06:01:58,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 62: [2022-11-26 06:01:58,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-26 06:01:58,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 30: [2022-11-26 06:01:58,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 4: [2022-11-26 06:01:58,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 36: [2022-11-26 06:01:58,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-26 06:01:58,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 36: [2022-11-26 06:01:58,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 36: [2022-11-26 06:01:58,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-26 06:01:58,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-26 06:01:58,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-26 06:01:58,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 36: [2022-11-26 06:01:58,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 36: [2022-11-26 06:01:58,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 37: [2022-11-26 06:01:58,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-26 06:01:58,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-26 06:01:58,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 17: [2022-11-26 06:01:58,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-26 06:01:58,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-26 06:01:58,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 16: [2022-11-26 06:01:58,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-26 06:01:58,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-26 06:01:58,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 4: [2022-11-26 06:01:58,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 7: [2022-11-26 06:01:58,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 4: [2022-11-26 06:01:58,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 7: [2022-11-26 06:01:58,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-26 06:01:58,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 28: [2022-11-26 06:01:58,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 28: [2022-11-26 06:01:58,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-26 06:01:58,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 50: [2022-11-26 06:01:58,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-26 06:01:58,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-26 06:01:58,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 5: [2022-11-26 06:01:58,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-26 06:01:58,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-26 06:01:58,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 39: [2022-11-26 06:01:58,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:01:58,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-26 06:01:58,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 13: [2022-11-26 06:01:58,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-26 06:01:58,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-26 06:01:58,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 4: [2022-11-26 06:01:58,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-26 06:01:58,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-26 06:01:58,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 35: [2022-11-26 06:01:58,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-26 06:01:58,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-26 06:01:58,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 20: [2022-11-26 06:01:58,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-26 06:01:58,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-26 06:01:58,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 46: [2022-11-26 06:01:58,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-26 06:01:58,976] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-26 06:01:58,976] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 6: [2022-11-26 06:01:58,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-26 06:01:58,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-26 06:01:58,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 57: [2022-11-26 06:01:58,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 57: [2022-11-26 06:01:58,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 57: [2022-11-26 06:01:58,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 21: [2022-11-26 06:01:58,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-26 06:01:58,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-26 06:01:58,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 32: [2022-11-26 06:01:58,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-26 06:01:58,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-26 06:01:58,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 62: [2022-11-26 06:01:58,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-26 06:01:58,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-26 06:01:58,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 60: [2022-11-26 06:01:58,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:01:58,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-26 06:01:58,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 56: [2022-11-26 06:01:58,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-26 06:01:58,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-26 06:01:58,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 31: [2022-11-26 06:01:58,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-26 06:01:58,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-26 06:01:58,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 33: [2022-11-26 06:01:58,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 33: [2022-11-26 06:01:58,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-26 06:01:58,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 0: [2022-11-26 06:01:58,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-26 06:01:58,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-26 06:01:58,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 53: [2022-11-26 06:01:58,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-26 06:01:58,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-26 06:01:58,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 46: [2022-11-26 06:01:58,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 55: [2022-11-26 06:01:58,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 46: [2022-11-26 06:01:58,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 55: [2022-11-26 06:01:58,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 46: [2022-11-26 06:01:58,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 55: [2022-11-26 06:01:58,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 51: [2022-11-26 06:01:59,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:01:59,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:01:59,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-26 06:01:59,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 42: [2022-11-26 06:01:59,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 42: [2022-11-26 06:01:59,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 42: [2022-11-26 06:01:59,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 44: [2022-11-26 06:01:59,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-26 06:01:59,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-26 06:01:59,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 49: [2022-11-26 06:01:59,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-26 06:01:59,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-26 06:01:59,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 51: [2022-11-26 06:01:59,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-26 06:01:59,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 11: [2022-11-26 06:01:59,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 11: [2022-11-26 06:01:59,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-26 06:01:59,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 25: [2022-11-26 06:01:59,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-26 06:01:59,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-26 06:01:59,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 59: [2022-11-26 06:01:59,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-26 06:01:59,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-26 06:01:59,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 23: [2022-11-26 06:01:59,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 23: [2022-11-26 06:01:59,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-26 06:01:59,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 15: [2022-11-26 06:01:59,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 15: [2022-11-26 06:01:59,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-26 06:01:59,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 8: [2022-11-26 06:01:59,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:01:59,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 8: [2022-11-26 06:01:59,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 14: [2022-11-26 06:01:59,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-26 06:01:59,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-26 06:01:59,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 29: [2022-11-26 06:01:59,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-26 06:01:59,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-26 06:01:59,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 2: [2022-11-26 06:01:59,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-26 06:01:59,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-26 06:01:59,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 45: [2022-11-26 06:01:59,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-26 06:01:59,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-26 06:01:59,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 47: [2022-11-26 06:01:59,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-26 06:01:59,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-26 06:01:59,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 27: [2022-11-26 06:01:59,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-26 06:01:59,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-26 06:01:59,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 38: [2022-11-26 06:01:59,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-26 06:01:59,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-26 06:01:59,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 46: [2022-11-26 06:01:59,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-26 06:01:59,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-26 06:01:59,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 24: [2022-11-26 06:01:59,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 24: [2022-11-26 06:01:59,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-26 06:01:59,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 10: [2022-11-26 06:01:59,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 54: [2022-11-26 06:01:59,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 10: [2022-11-26 06:01:59,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-26 06:01:59,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 54: [2022-11-26 06:01:59,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-26 06:01:59,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 48: [2022-11-26 06:01:59,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 48: [2022-11-26 06:01:59,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-26 06:01:59,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 34: [2022-11-26 06:01:59,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 58: [2022-11-26 06:01:59,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 34: [2022-11-26 06:01:59,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 58: [2022-11-26 06:01:59,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 34: [2022-11-26 06:01:59,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 58: [2022-11-26 06:01:59,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 1: [2022-11-26 06:01:59,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-26 06:01:59,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-26 06:01:59,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 22: [2022-11-26 06:01:59,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 22: [2022-11-26 06:01:59,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 22: [2022-11-26 06:01:59,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 40: [2022-11-26 06:01:59,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-26 06:01:59,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-26 06:01:59,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 18: [2022-11-26 06:01:59,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-26 06:01:59,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-26 06:01:59,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 52: [2022-11-26 06:01:59,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 9: [2022-11-26 06:01:59,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 52: [2022-11-26 06:01:59,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-26 06:01:59,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 9: [2022-11-26 06:01:59,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-26 06:01:59,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 3: [2022-11-26 06:01:59,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 36: [2022-11-26 06:01:59,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 12: [2022-11-26 06:01:59,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-26 06:01:59,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-26 06:01:59,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 3: [2022-11-26 06:01:59,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-26 06:01:59,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 26: [2022-11-26 06:01:59,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-26 06:01:59,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-26 06:01:59,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 37: [2022-11-26 06:01:59,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-26 06:01:59,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-26 06:01:59,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 19: [2022-11-26 06:01:59,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-26 06:01:59,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-26 06:01:59,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 36: [2022-11-26 06:01:59,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-26 06:01:59,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 16: [2022-11-26 06:01:59,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-26 06:01:59,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 17: [2022-11-26 06:01:59,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 16: [2022-11-26 06:01:59,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 17: [2022-11-26 06:01:59,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-26 06:01:59,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 4: [2022-11-26 06:01:59,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-26 06:01:59,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-26 06:01:59,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 39: [2022-11-26 06:01:59,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 35: [2022-11-26 06:01:59,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:01:59,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 35: [2022-11-26 06:01:59,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 39: [2022-11-26 06:01:59,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 35: [2022-11-26 06:01:59,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 61: [2022-11-26 06:01:59,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:01:59,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-26 06:01:59,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 20: [2022-11-26 06:01:59,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-26 06:01:59,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-26 06:01:59,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 41: [2022-11-26 06:01:59,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-26 06:01:59,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-26 06:01:59,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 7: [2022-11-26 06:01:59,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-26 06:01:59,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-26 06:01:59,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 43: [2022-11-26 06:01:59,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 30: [2022-11-26 06:01:59,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 43: [2022-11-26 06:01:59,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-26 06:01:59,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 50: [2022-11-26 06:01:59,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-26 06:01:59,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-26 06:01:59,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 30: [2022-11-26 06:01:59,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-26 06:01:59,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 5: [2022-11-26 06:01:59,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-26 06:01:59,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-26 06:01:59,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 13: [2022-11-26 06:01:59,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 13: [2022-11-26 06:01:59,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-26 06:01:59,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 28: [2022-11-26 06:01:59,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-26 06:01:59,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-26 06:01:59,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 21: [2022-11-26 06:01:59,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-26 06:01:59,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-26 06:01:59,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 32: [2022-11-26 06:01:59,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 32: [2022-11-26 06:01:59,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-26 06:01:59,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 6: [2022-11-26 06:01:59,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-26 06:01:59,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-26 06:01:59,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 55: [2022-11-26 06:01:59,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-26 06:01:59,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-26 06:01:59,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 49: [2022-11-26 06:01:59,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-26 06:01:59,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-26 06:01:59,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 44: [2022-11-26 06:01:59,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-26 06:01:59,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-26 06:01:59,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 0: [2022-11-26 06:01:59,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 0: [2022-11-26 06:01:59,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-26 06:01:59,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 51: [2022-11-26 06:01:59,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 53: [2022-11-26 06:01:59,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 51: [2022-11-26 06:01:59,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-26 06:01:59,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 53: [2022-11-26 06:01:59,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-26 06:01:59,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 60: [2022-11-26 06:01:59,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:01:59,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-26 06:01:59,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 42: [2022-11-26 06:01:59,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 42: [2022-11-26 06:01:59,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-26 06:01:59,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 59: [2022-11-26 06:01:59,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 56: [2022-11-26 06:01:59,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 59: [2022-11-26 06:01:59,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-26 06:01:59,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 56: [2022-11-26 06:01:59,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 56: [2022-11-26 06:01:59,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 11: [2022-11-26 06:01:59,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-26 06:01:59,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 11: [2022-11-26 06:01:59,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 33: [2022-11-26 06:01:59,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-26 06:01:59,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-26 06:01:59,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 46: [2022-11-26 06:01:59,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-26 06:01:59,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-26 06:01:59,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 63: [2022-11-26 06:01:59,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:01:59,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-26 06:01:59,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 31: [2022-11-26 06:01:59,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 57: [2022-11-26 06:01:59,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 31: [2022-11-26 06:01:59,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-26 06:01:59,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 14: [2022-11-26 06:01:59,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 57: [2022-11-26 06:01:59,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-26 06:01:59,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 15: [2022-11-26 06:01:59,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 14: [2022-11-26 06:01:59,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-26 06:01:59,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 15: [2022-11-26 06:01:59,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-26 06:01:59,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 23: [2022-11-26 06:01:59,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-26 06:01:59,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-26 06:01:59,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 25: [2022-11-26 06:01:59,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-26 06:01:59,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-26 06:01:59,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 1: [2022-11-26 06:01:59,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-26 06:01:59,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 62: [2022-11-26 06:01:59,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-26 06:01:59,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 1: [2022-11-26 06:01:59,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 62: [2022-11-26 06:01:59,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 8: [2022-11-26 06:01:59,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:01:59,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-26 06:01:59,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 10: [2022-11-26 06:01:59,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-26 06:01:59,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-26 06:01:59,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 29: [2022-11-26 06:01:59,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 29: [2022-11-26 06:01:59,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-26 06:01:59,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 22: [2022-11-26 06:01:59,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-26 06:01:59,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-26 06:01:59,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 38: [2022-11-26 06:01:59,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-26 06:01:59,085] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-26 06:01:59,085] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 24: [2022-11-26 06:01:59,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-26 06:01:59,087] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-26 06:01:59,087] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 45: [2022-11-26 06:01:59,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 45: [2022-11-26 06:01:59,087] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-26 06:01:59,087] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 52: [2022-11-26 06:01:59,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-26 06:01:59,087] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-26 06:01:59,087] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 34: [2022-11-26 06:01:59,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 27: [2022-11-26 06:01:59,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-26 06:01:59,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-26 06:01:59,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 34: [2022-11-26 06:01:59,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-26 06:01:59,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 48: [2022-11-26 06:01:59,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-26 06:01:59,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-26 06:01:59,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 2: [2022-11-26 06:01:59,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-26 06:01:59,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-26 06:01:59,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 47: [2022-11-26 06:01:59,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-26 06:01:59,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-26 06:01:59,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 3: [2022-11-26 06:01:59,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 54: [2022-11-26 06:01:59,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 3: [2022-11-26 06:01:59,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-26 06:01:59,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 54: [2022-11-26 06:01:59,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-26 06:01:59,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 40: [2022-11-26 06:01:59,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-26 06:01:59,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-26 06:01:59,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 12: [2022-11-26 06:01:59,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-26 06:01:59,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-26 06:01:59,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 26: [2022-11-26 06:01:59,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 26: [2022-11-26 06:01:59,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-26 06:01:59,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 9: [2022-11-26 06:01:59,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-26 06:01:59,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-26 06:01:59,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 19: [2022-11-26 06:01:59,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-26 06:01:59,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-26 06:01:59,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 4: [2022-11-26 06:01:59,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 58: [2022-11-26 06:01:59,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-26 06:01:59,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-26 06:01:59,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 18: [2022-11-26 06:01:59,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-26 06:01:59,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-26 06:01:59,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 61: [2022-11-26 06:01:59,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:01:59,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 61: [2022-11-26 06:01:59,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 4: [2022-11-26 06:01:59,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-26 06:01:59,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 17: [2022-11-26 06:01:59,098] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-26 06:01:59,098] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-26 06:01:59,098] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 43: [2022-11-26 06:01:59,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-26 06:01:59,099] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 36: [2022-11-26 06:01:59,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 43: [2022-11-26 06:01:59,099] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 30: [2022-11-26 06:01:59,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-26 06:01:59,099] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 30: [2022-11-26 06:01:59,099] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 35: [2022-11-26 06:01:59,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 41: [2022-11-26 06:01:59,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 35: [2022-11-26 06:01:59,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 41: [2022-11-26 06:01:59,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 35: [2022-11-26 06:01:59,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 41: [2022-11-26 06:01:59,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 7: [2022-11-26 06:01:59,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-26 06:01:59,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-26 06:01:59,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 36: [2022-11-26 06:01:59,099] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-26 06:01:59,099] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 20: [2022-11-26 06:01:59,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 16: [2022-11-26 06:01:59,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 20: [2022-11-26 06:01:59,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 16: [2022-11-26 06:01:59,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 20: [2022-11-26 06:01:59,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 16: [2022-11-26 06:01:59,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 50: [2022-11-26 06:01:59,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-26 06:01:59,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-26 06:01:59,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 28: [2022-11-26 06:01:59,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-26 06:01:59,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-26 06:01:59,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 5: [2022-11-26 06:01:59,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-26 06:01:59,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-26 06:01:59,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 37: [2022-11-26 06:01:59,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 37: [2022-11-26 06:01:59,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-26 06:01:59,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 39: [2022-11-26 06:01:59,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 46: [2022-11-26 06:01:59,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-26 06:01:59,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 39: [2022-11-26 06:01:59,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 46: [2022-11-26 06:01:59,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 39: [2022-11-26 06:01:59,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 62: [2022-11-26 06:01:59,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-26 06:01:59,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-26 06:01:59,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 60: [2022-11-26 06:01:59,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:01:59,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-26 06:01:59,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 6: [2022-11-26 06:01:59,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-26 06:01:59,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-26 06:01:59,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 21: [2022-11-26 06:01:59,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 31: [2022-11-26 06:01:59,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-26 06:01:59,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 21: [2022-11-26 06:01:59,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 31: [2022-11-26 06:01:59,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 21: [2022-11-26 06:01:59,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 53: [2022-11-26 06:01:59,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 42: [2022-11-26 06:01:59,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 53: [2022-11-26 06:01:59,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-26 06:01:59,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 42: [2022-11-26 06:01:59,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-26 06:01:59,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 56: [2022-11-26 06:01:59,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 44: [2022-11-26 06:01:59,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-26 06:01:59,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-26 06:01:59,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 56: [2022-11-26 06:01:59,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-26 06:01:59,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 32: [2022-11-26 06:01:59,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-26 06:01:59,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-26 06:01:59,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 51: [2022-11-26 06:01:59,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 51: [2022-11-26 06:01:59,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-26 06:01:59,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 59: [2022-11-26 06:01:59,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 59: [2022-11-26 06:01:59,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-26 06:01:59,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 0: [2022-11-26 06:01:59,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 55: [2022-11-26 06:01:59,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 0: [2022-11-26 06:01:59,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 55: [2022-11-26 06:01:59,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 0: [2022-11-26 06:01:59,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 55: [2022-11-26 06:01:59,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 33: [2022-11-26 06:01:59,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-26 06:01:59,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-26 06:01:59,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 57: [2022-11-26 06:01:59,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 57: [2022-11-26 06:01:59,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 57: [2022-11-26 06:01:59,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 11: [2022-11-26 06:01:59,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-26 06:01:59,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-26 06:01:59,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 63: [2022-11-26 06:01:59,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:01:59,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-26 06:01:59,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 15: [2022-11-26 06:01:59,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 25: [2022-11-26 06:01:59,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 15: [2022-11-26 06:01:59,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 25: [2022-11-26 06:01:59,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 15: [2022-11-26 06:01:59,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 25: [2022-11-26 06:01:59,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 14: [2022-11-26 06:01:59,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-26 06:01:59,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-26 06:01:59,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 10: [2022-11-26 06:01:59,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-26 06:01:59,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-26 06:01:59,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 8: [2022-11-26 06:01:59,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:01:59,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-26 06:01:59,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 13: [2022-11-26 06:01:59,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 23: [2022-11-26 06:01:59,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 13: [2022-11-26 06:01:59,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-26 06:01:59,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 23: [2022-11-26 06:01:59,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-26 06:01:59,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 49: [2022-11-26 06:01:59,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 49: [2022-11-26 06:01:59,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-26 06:01:59,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 1: [2022-11-26 06:01:59,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-26 06:01:59,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-26 06:01:59,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 45: [2022-11-26 06:01:59,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-26 06:01:59,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-26 06:01:59,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 38: [2022-11-26 06:01:59,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-26 06:01:59,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-26 06:01:59,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 22: [2022-11-26 06:01:59,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-26 06:01:59,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-26 06:01:59,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 40: [2022-11-26 06:01:59,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-26 06:01:59,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-26 06:01:59,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 24: [2022-11-26 06:01:59,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-26 06:01:59,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-26 06:01:59,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 12: [2022-11-26 06:01:59,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-26 06:01:59,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-26 06:01:59,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 47: [2022-11-26 06:01:59,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-26 06:01:59,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-26 06:01:59,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 3: [2022-11-26 06:01:59,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 3: [2022-11-26 06:01:59,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 26: [2022-11-26 06:01:59,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 3: [2022-11-26 06:01:59,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 29: [2022-11-26 06:01:59,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 26: [2022-11-26 06:01:59,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-26 06:01:59,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 29: [2022-11-26 06:01:59,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 29: [2022-11-26 06:01:59,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 54: [2022-11-26 06:01:59,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 2: [2022-11-26 06:01:59,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 54: [2022-11-26 06:01:59,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 2: [2022-11-26 06:01:59,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 54: [2022-11-26 06:01:59,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 2: [2022-11-26 06:01:59,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 52: [2022-11-26 06:01:59,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-26 06:01:59,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-26 06:01:59,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 13: [2022-11-26 06:01:59,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-26 06:01:59,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-26 06:01:59,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 48: [2022-11-26 06:01:59,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-26 06:01:59,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-26 06:01:59,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 36: [2022-11-26 06:01:59,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-26 06:01:59,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-26 06:01:59,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 18: [2022-11-26 06:01:59,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-26 06:01:59,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-26 06:01:59,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 58: [2022-11-26 06:01:59,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 4: [2022-11-26 06:01:59,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 58: [2022-11-26 06:01:59,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 4: [2022-11-26 06:01:59,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 58: [2022-11-26 06:01:59,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 4: [2022-11-26 06:01:59,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 30: [2022-11-26 06:01:59,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 9: [2022-11-26 06:01:59,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 9: [2022-11-26 06:01:59,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-26 06:01:59,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 30: [2022-11-26 06:01:59,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-26 06:01:59,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 37: [2022-11-26 06:01:59,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 37: [2022-11-26 06:01:59,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-26 06:01:59,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 19: [2022-11-26 06:01:59,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 19: [2022-11-26 06:01:59,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-26 06:01:59,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 61: [2022-11-26 06:01:59,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:01:59,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 41: [2022-11-26 06:01:59,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:01:59,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 41: [2022-11-26 06:01:59,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-26 06:01:59,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 35: [2022-11-26 06:01:59,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-26 06:01:59,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-26 06:01:59,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 17: [2022-11-26 06:01:59,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-26 06:01:59,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-26 06:01:59,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 27: [2022-11-26 06:01:59,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 27: [2022-11-26 06:01:59,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-26 06:01:59,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 39: [2022-11-26 06:01:59,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:01:59,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-26 06:01:59,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 6: [2022-11-26 06:01:59,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-26 06:01:59,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 20: [2022-11-26 06:01:59,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 6: [2022-11-26 06:01:59,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 20: [2022-11-26 06:01:59,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 20: [2022-11-26 06:01:59,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 34: [2022-11-26 06:01:59,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 34: [2022-11-26 06:01:59,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-26 06:01:59,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 50: [2022-11-26 06:01:59,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 7: [2022-11-26 06:01:59,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 50: [2022-11-26 06:01:59,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 7: [2022-11-26 06:01:59,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 50: [2022-11-26 06:01:59,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 7: [2022-11-26 06:01:59,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 5: [2022-11-26 06:01:59,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 5: [2022-11-26 06:01:59,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-26 06:01:59,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 16: [2022-11-26 06:01:59,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-26 06:01:59,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-26 06:01:59,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 46: [2022-11-26 06:01:59,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 43: [2022-11-26 06:01:59,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 46: [2022-11-26 06:01:59,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-26 06:01:59,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 43: [2022-11-26 06:01:59,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 28: [2022-11-26 06:01:59,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 43: [2022-11-26 06:01:59,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 28: [2022-11-26 06:01:59,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-26 06:01:59,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 32: [2022-11-26 06:01:59,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-26 06:01:59,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-26 06:01:59,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 57: [2022-11-26 06:01:59,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 57: [2022-11-26 06:01:59,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 57: [2022-11-26 06:01:59,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 62: [2022-11-26 06:01:59,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-26 06:01:59,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 62: [2022-11-26 06:01:59,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 21: [2022-11-26 06:01:59,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-26 06:01:59,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-26 06:01:59,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 44: [2022-11-26 06:01:59,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-26 06:01:59,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-26 06:01:59,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 60: [2022-11-26 06:01:59,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 33: [2022-11-26 06:01:59,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:01:59,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-26 06:01:59,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 33: [2022-11-26 06:01:59,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-26 06:01:59,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 31: [2022-11-26 06:01:59,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-26 06:01:59,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-26 06:01:59,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 59: [2022-11-26 06:01:59,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-26 06:01:59,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-26 06:01:59,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 56: [2022-11-26 06:01:59,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 51: [2022-11-26 06:01:59,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 56: [2022-11-26 06:01:59,179] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-26 06:01:59,179] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 51: [2022-11-26 06:01:59,179] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-26 06:01:59,179] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 42: [2022-11-26 06:01:59,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 42: [2022-11-26 06:01:59,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 49: [2022-11-26 06:01:59,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 42: [2022-11-26 06:01:59,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 49: [2022-11-26 06:01:59,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-26 06:01:59,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 14: [2022-11-26 06:01:59,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 0: [2022-11-26 06:01:59,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 14: [2022-11-26 06:01:59,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-26 06:01:59,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 0: [2022-11-26 06:01:59,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 0: [2022-11-26 06:01:59,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 1: [2022-11-26 06:01:59,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 1: [2022-11-26 06:01:59,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 1: [2022-11-26 06:01:59,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 53: [2022-11-26 06:01:59,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-26 06:01:59,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-26 06:01:59,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 11: [2022-11-26 06:01:59,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 11: [2022-11-26 06:01:59,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 25: [2022-11-26 06:01:59,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 11: [2022-11-26 06:01:59,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 25: [2022-11-26 06:01:59,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-26 06:01:59,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 23: [2022-11-26 06:01:59,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-26 06:01:59,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-26 06:01:59,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 45: [2022-11-26 06:01:59,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-26 06:01:59,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-26 06:01:59,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 63: [2022-11-26 06:01:59,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:01:59,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-26 06:01:59,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 29: [2022-11-26 06:01:59,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 22: [2022-11-26 06:01:59,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-26 06:01:59,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-26 06:01:59,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 29: [2022-11-26 06:01:59,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-26 06:01:59,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 15: [2022-11-26 06:01:59,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-26 06:01:59,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-26 06:01:59,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 38: [2022-11-26 06:01:59,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 38: [2022-11-26 06:01:59,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-26 06:01:59,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 2: [2022-11-26 06:01:59,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-26 06:01:59,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-26 06:01:59,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 55: [2022-11-26 06:01:59,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 55: [2022-11-26 06:01:59,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-26 06:01:59,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 10: [2022-11-26 06:01:59,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 24: [2022-11-26 06:01:59,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 10: [2022-11-26 06:01:59,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 10: [2022-11-26 06:01:59,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 24: [2022-11-26 06:01:59,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-26 06:01:59,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 34: [2022-11-26 06:01:59,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 8: [2022-11-26 06:01:59,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 34: [2022-11-26 06:01:59,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-26 06:01:59,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 8: [2022-11-26 06:01:59,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-26 06:01:59,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 48: [2022-11-26 06:01:59,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-26 06:01:59,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-26 06:01:59,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 3: [2022-11-26 06:01:59,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 3: [2022-11-26 06:01:59,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-26 06:01:59,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 35: [2022-11-26 06:01:59,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-26 06:01:59,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-26 06:01:59,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 28: [2022-11-26 06:01:59,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 28: [2022-11-26 06:01:59,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-26 06:01:59,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 18: [2022-11-26 06:01:59,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-26 06:01:59,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-26 06:01:59,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 33: [2022-11-26 06:01:59,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 33: [2022-11-26 06:01:59,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 33: [2022-11-26 06:01:59,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 54: [2022-11-26 06:01:59,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 54: [2022-11-26 06:01:59,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-26 06:01:59,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 32: [2022-11-26 06:01:59,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 59: [2022-11-26 06:01:59,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 32: [2022-11-26 06:01:59,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-26 06:01:59,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 59: [2022-11-26 06:01:59,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-26 06:01:59,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 40: [2022-11-26 06:01:59,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 56: [2022-11-26 06:01:59,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 58: [2022-11-26 06:01:59,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 40: [2022-11-26 06:01:59,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 7: [2022-11-26 06:01:59,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 40: [2022-11-26 06:01:59,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 56: [2022-11-26 06:01:59,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 58: [2022-11-26 06:01:59,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 7: [2022-11-26 06:01:59,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 58: [2022-11-26 06:01:59,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 7: [2022-11-26 06:01:59,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 56: [2022-11-26 06:01:59,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 27: [2022-11-26 06:01:59,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-26 06:01:59,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-26 06:01:59,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 47: [2022-11-26 06:01:59,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 12: [2022-11-26 06:01:59,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 47: [2022-11-26 06:01:59,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-26 06:01:59,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 51: [2022-11-26 06:01:59,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 12: [2022-11-26 06:01:59,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-26 06:01:59,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 51: [2022-11-26 06:01:59,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-26 06:01:59,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 31: [2022-11-26 06:01:59,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 31: [2022-11-26 06:01:59,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-26 06:01:59,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 16: [2022-11-26 06:01:59,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-26 06:01:59,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-26 06:01:59,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 9: [2022-11-26 06:01:59,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-26 06:01:59,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-26 06:01:59,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 46: [2022-11-26 06:01:59,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-26 06:01:59,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 46: [2022-11-26 06:01:59,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 61: [2022-11-26 06:01:59,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 53: [2022-11-26 06:01:59,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:01:59,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 53: [2022-11-26 06:01:59,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 57: [2022-11-26 06:01:59,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:01:59,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 53: [2022-11-26 06:01:59,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 57: [2022-11-26 06:01:59,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-26 06:01:59,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 12: [2022-11-26 06:01:59,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 19: [2022-11-26 06:01:59,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 41: [2022-11-26 06:01:59,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 12: [2022-11-26 06:01:59,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 19: [2022-11-26 06:01:59,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 12: [2022-11-26 06:01:59,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 41: [2022-11-26 06:01:59,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 19: [2022-11-26 06:01:59,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 41: [2022-11-26 06:01:59,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 20: [2022-11-26 06:01:59,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-26 06:01:59,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-26 06:01:59,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 42: [2022-11-26 06:01:59,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:01:59,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 42: [2022-11-26 06:01:59,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 39: [2022-11-26 06:01:59,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 42: [2022-11-26 06:01:59,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 48: [2022-11-26 06:01:59,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:01:59,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 48: [2022-11-26 06:01:59,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 37: [2022-11-26 06:01:59,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 48: [2022-11-26 06:01:59,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 37: [2022-11-26 06:01:59,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 34: [2022-11-26 06:01:59,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 37: [2022-11-26 06:01:59,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 34: [2022-11-26 06:01:59,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-26 06:01:59,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 50: [2022-11-26 06:01:59,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 4: [2022-11-26 06:01:59,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 50: [2022-11-26 06:01:59,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 4: [2022-11-26 06:01:59,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 22: [2022-11-26 06:01:59,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 40: [2022-11-26 06:01:59,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 50: [2022-11-26 06:01:59,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 4: [2022-11-26 06:01:59,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 22: [2022-11-26 06:01:59,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 40: [2022-11-26 06:01:59,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 22: [2022-11-26 06:01:59,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 3: [2022-11-26 06:01:59,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 40: [2022-11-26 06:01:59,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 52: [2022-11-26 06:01:59,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 3: [2022-11-26 06:01:59,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 52: [2022-11-26 06:01:59,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-26 06:01:59,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 52: [2022-11-26 06:01:59,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 47: [2022-11-26 06:01:59,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 52: [2022-11-26 06:01:59,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 47: [2022-11-26 06:01:59,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 52: [2022-11-26 06:01:59,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 3: [2022-11-26 06:01:59,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 47: [2022-11-26 06:01:59,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 43: [2022-11-26 06:01:59,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 13: [2022-11-26 06:01:59,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 43: [2022-11-26 06:01:59,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-26 06:01:59,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 13: [2022-11-26 06:01:59,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-26 06:01:59,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 17: [2022-11-26 06:01:59,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-26 06:01:59,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-26 06:01:59,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 30: [2022-11-26 06:01:59,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-26 06:01:59,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-26 06:01:59,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 38: [2022-11-26 06:01:59,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 38: [2022-11-26 06:01:59,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-26 06:01:59,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 55: [2022-11-26 06:01:59,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-26 06:01:59,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-26 06:01:59,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 54: [2022-11-26 06:01:59,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:01:59,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 21: [2022-11-26 06:01:59,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 54: [2022-11-26 06:01:59,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 60: [2022-11-26 06:01:59,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-26 06:01:59,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 21: [2022-11-26 06:01:59,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 54: [2022-11-26 06:01:59,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 21: [2022-11-26 06:01:59,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 44: [2022-11-26 06:01:59,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-26 06:01:59,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-26 06:01:59,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 63: [2022-11-26 06:01:59,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:01:59,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-26 06:01:59,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 8: [2022-11-26 06:01:59,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:01:59,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-26 06:01:59,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 36: [2022-11-26 06:01:59,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-26 06:01:59,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 36: [2022-11-26 06:01:59,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 6: [2022-11-26 06:01:59,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 6: [2022-11-26 06:01:59,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 6: [2022-11-26 06:01:59,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 36: [2022-11-26 06:01:59,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 62: [2022-11-26 06:01:59,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-26 06:01:59,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-26 06:01:59,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 5: [2022-11-26 06:01:59,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-26 06:01:59,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-26 06:01:59,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 36: [2022-11-26 06:01:59,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-26 06:01:59,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 49: [2022-11-26 06:01:59,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-26 06:01:59,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-26 06:01:59,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 49: [2022-11-26 06:01:59,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-26 06:01:59,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-26 06:01:59,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 26: [2022-11-26 06:01:59,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 26: [2022-11-26 06:01:59,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step19000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-26 06:01:59,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! 0: successfully saved checkpoint at iteration 19000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5070.60 63: iteration 19010/ 24424 | consumed samples: 9733120 | consumed tokens: 19933429760 | elapsed time per iteration (s): 3.70 | learning rate: 4.137E-05 | global batch size: 512 | lm loss: 1.980933E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 138.450 | TFLOPs: 14.25 | 63: iteration 19020/ 24424 | consumed samples: 9738240 | consumed tokens: 19943915520 | elapsed time per iteration (s): 2.24 | learning rate: 4.129E-05 | global batch size: 512 | lm loss: 1.987211E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.388 | TFLOPs: 23.51 | 63: iteration 19030/ 24424 | consumed samples: 9743360 | consumed tokens: 19954401280 | elapsed time per iteration (s): 2.26 | learning rate: 4.122E-05 | global batch size: 512 | lm loss: 2.005577E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.615 | TFLOPs: 23.33 | 63: iteration 19040/ 24424 | consumed samples: 9748480 | consumed tokens: 19964887040 | elapsed time per iteration (s): 2.26 | learning rate: 4.114E-05 | global batch size: 512 | lm loss: 1.988917E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.176 | TFLOPs: 23.28 | 63: iteration 19050/ 24424 | consumed samples: 9753600 | consumed tokens: 19975372800 | elapsed time per iteration (s): 2.24 | learning rate: 4.107E-05 | global batch size: 512 | lm loss: 1.990221E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.167 | TFLOPs: 23.49 | 63: iteration 19060/ 24424 | consumed samples: 9758720 | consumed tokens: 19985858560 | elapsed time per iteration (s): 2.25 | learning rate: 4.099E-05 | global batch size: 512 | lm loss: 2.008803E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.561 | TFLOPs: 23.43 | 63: iteration 19070/ 24424 | consumed samples: 9763840 | consumed tokens: 19996344320 | elapsed time per iteration (s): 2.24 | learning rate: 4.092E-05 | global batch size: 512 | lm loss: 2.019096E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.064 | TFLOPs: 23.58 | 63: iteration 19080/ 24424 | consumed samples: 9768960 | consumed tokens: 20006830080 | elapsed time per iteration (s): 2.26 | learning rate: 4.084E-05 | global batch size: 512 | lm loss: 2.007838E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.966 | TFLOPs: 23.37 | 63: iteration 19090/ 24424 | consumed samples: 9774080 | consumed tokens: 20017315840 | elapsed time per iteration (s): 2.24 | learning rate: 4.077E-05 | global batch size: 512 | lm loss: 2.020336E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.297 | TFLOPs: 23.50 | 63: iteration 19100/ 24424 | consumed samples: 9779200 | consumed tokens: 20027801600 | elapsed time per iteration (s): 2.23 | learning rate: 4.069E-05 | global batch size: 512 | lm loss: 2.007109E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.964 | TFLOPs: 23.67 | 63: iteration 19110/ 24424 | consumed samples: 9784320 | consumed tokens: 20038287360 | elapsed time per iteration (s): 2.26 | learning rate: 4.062E-05 | global batch size: 512 | lm loss: 2.007257E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.994 | TFLOPs: 23.37 | 63: iteration 19120/ 24424 | consumed samples: 9789440 | consumed tokens: 20048773120 | elapsed time per iteration (s): 2.24 | learning rate: 4.054E-05 | global batch size: 512 | lm loss: 2.021633E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.414 | TFLOPs: 23.51 | 63: iteration 19130/ 24424 | consumed samples: 9794560 | consumed tokens: 20059258880 | elapsed time per iteration (s): 2.22 | learning rate: 4.047E-05 | global batch size: 512 | lm loss: 1.998686E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.181 | TFLOPs: 23.70 | 63: iteration 19140/ 24424 | consumed samples: 9799680 | consumed tokens: 20069744640 | elapsed time per iteration (s): 2.25 | learning rate: 4.039E-05 | global batch size: 512 | lm loss: 2.003415E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.410 | TFLOPs: 23.41 | 63: iteration 19150/ 24424 | consumed samples: 9804800 | consumed tokens: 20080230400 | elapsed time per iteration (s): 2.25 | learning rate: 4.032E-05 | global batch size: 512 | lm loss: 1.999433E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.186 | TFLOPs: 23.39 | 63: iteration 19160/ 24424 | consumed samples: 9809920 | consumed tokens: 20090716160 | elapsed time per iteration (s): 2.23 | learning rate: 4.025E-05 | global batch size: 512 | lm loss: 2.002090E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.406 | TFLOPs: 23.62 | 63: iteration 19170/ 24424 | consumed samples: 9815040 | consumed tokens: 20101201920 | elapsed time per iteration (s): 2.45 | learning rate: 4.017E-05 | global batch size: 512 | lm loss: 1.979078E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 209.038 | TFLOPs: 21.52 | 63: iteration 19180/ 24424 | consumed samples: 9820160 | consumed tokens: 20111687680 | elapsed time per iteration (s): 2.29 | learning rate: 4.010E-05 | global batch size: 512 | lm loss: 2.004248E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.788 | TFLOPs: 23.04 | 63: iteration 19190/ 24424 | consumed samples: 9825280 | consumed tokens: 20122173440 | elapsed time per iteration (s): 4.25 | learning rate: 4.002E-05 | global batch size: 512 | lm loss: 1.995708E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 120.379 | TFLOPs: 12.39 | 63: iteration 19200/ 24424 | consumed samples: 9830400 | consumed tokens: 20132659200 | elapsed time per iteration (s): 2.23 | learning rate: 3.995E-05 | global batch size: 512 | lm loss: 2.006410E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.896 | TFLOPs: 23.67 | 63: iteration 19210/ 24424 | consumed samples: 9835520 | consumed tokens: 20143144960 | elapsed time per iteration (s): 2.24 | learning rate: 3.988E-05 | global batch size: 512 | lm loss: 2.008500E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.379 | TFLOPs: 23.51 | 63: iteration 19220/ 24424 | consumed samples: 9840640 | consumed tokens: 20153630720 | elapsed time per iteration (s): 2.24 | learning rate: 3.980E-05 | global batch size: 512 | lm loss: 2.003003E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.614 | TFLOPs: 23.53 | 63: iteration 19230/ 24424 | consumed samples: 9845760 | consumed tokens: 20164116480 | elapsed time per iteration (s): 2.26 | learning rate: 3.973E-05 | global batch size: 512 | lm loss: 1.997036E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.221 | TFLOPs: 23.29 | 63: iteration 19240/ 24424 | consumed samples: 9850880 | consumed tokens: 20174602240 | elapsed time per iteration (s): 2.25 | learning rate: 3.966E-05 | global batch size: 512 | lm loss: 1.990028E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.475 | TFLOPs: 23.42 | 63: iteration 19250/ 24424 | consumed samples: 9856000 | consumed tokens: 20185088000 | elapsed time per iteration (s): 2.25 | learning rate: 3.959E-05 | global batch size: 512 | lm loss: 2.003738E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.596 | TFLOPs: 23.43 | 63: iteration 19260/ 24424 | consumed samples: 9861120 | consumed tokens: 20195573760 | elapsed time per iteration (s): 2.27 | learning rate: 3.951E-05 | global batch size: 512 | lm loss: 2.011697E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.179 | TFLOPs: 23.18 | 63: iteration 19270/ 24424 | consumed samples: 9866240 | consumed tokens: 20206059520 | elapsed time per iteration (s): 2.27 | learning rate: 3.944E-05 | global batch size: 512 | lm loss: 2.003966E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.042 | TFLOPs: 23.27 | 63: iteration 19280/ 24424 | consumed samples: 9871360 | consumed tokens: 20216545280 | elapsed time per iteration (s): 2.26 | learning rate: 3.937E-05 | global batch size: 512 | lm loss: 1.993118E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.420 | TFLOPs: 23.31 | 63: iteration 19290/ 24424 | consumed samples: 9876480 | consumed tokens: 20227031040 | elapsed time per iteration (s): 2.27 | learning rate: 3.930E-05 | global batch size: 512 | lm loss: 1.987844E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.776 | TFLOPs: 23.24 | 63: iteration 19300/ 24424 | consumed samples: 9881600 | consumed tokens: 20237516800 | elapsed time per iteration (s): 2.24 | learning rate: 3.922E-05 | global batch size: 512 | lm loss: 2.004180E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.467 | TFLOPs: 23.52 | 63: iteration 19310/ 24424 | consumed samples: 9886720 | consumed tokens: 20248002560 | elapsed time per iteration (s): 2.25 | learning rate: 3.915E-05 | global batch size: 512 | lm loss: 1.997577E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.676 | TFLOPs: 23.44 | 63: iteration 19320/ 24424 | consumed samples: 9891840 | consumed tokens: 20258488320 | elapsed time per iteration (s): 2.29 | learning rate: 3.908E-05 | global batch size: 512 | lm loss: 1.997787E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.656 | TFLOPs: 23.02 | 63: iteration 19330/ 24424 | consumed samples: 9896960 | consumed tokens: 20268974080 | elapsed time per iteration (s): 2.46 | learning rate: 3.901E-05 | global batch size: 512 | lm loss: 1.994676E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 208.305 | TFLOPs: 21.44 | 63: iteration 19340/ 24424 | consumed samples: 9902080 | consumed tokens: 20279459840 | elapsed time per iteration (s): 2.24 | learning rate: 3.894E-05 | global batch size: 512 | lm loss: 2.004967E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.568 | TFLOPs: 23.53 | 63: iteration 19350/ 24424 | consumed samples: 9907200 | consumed tokens: 20289945600 | elapsed time per iteration (s): 2.25 | learning rate: 3.886E-05 | global batch size: 512 | lm loss: 2.000954E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.000 | TFLOPs: 23.47 | 63: iteration 19360/ 24424 | consumed samples: 9912320 | consumed tokens: 20300431360 | elapsed time per iteration (s): 2.23 | learning rate: 3.879E-05 | global batch size: 512 | lm loss: 2.018163E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.813 | TFLOPs: 23.66 | 63: iteration 19370/ 24424 | consumed samples: 9917440 | consumed tokens: 20310917120 | elapsed time per iteration (s): 2.25 | learning rate: 3.872E-05 | global batch size: 512 | lm loss: 1.974858E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.343 | TFLOPs: 23.40 | 63: iteration 19380/ 24424 | consumed samples: 9922560 | consumed tokens: 20321402880 | elapsed time per iteration (s): 2.23 | learning rate: 3.865E-05 | global batch size: 512 | lm loss: 1.985131E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.716 | TFLOPs: 23.65 | 63: iteration 19390/ 24424 | consumed samples: 9927680 | consumed tokens: 20331888640 | elapsed time per iteration (s): 2.28 | learning rate: 3.858E-05 | global batch size: 512 | lm loss: 1.983229E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.782 | TFLOPs: 23.14 | 63: iteration 19400/ 24424 | consumed samples: 9932800 | consumed tokens: 20342374400 | elapsed time per iteration (s): 2.25 | learning rate: 3.851E-05 | global batch size: 512 | lm loss: 1.997697E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.384 | TFLOPs: 23.41 | 63: iteration 19410/ 24424 | consumed samples: 9937920 | consumed tokens: 20352860160 | elapsed time per iteration (s): 2.51 | learning rate: 3.844E-05 | global batch size: 512 | lm loss: 1.997666E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 204.102 | TFLOPs: 21.01 | 63: iteration 19420/ 24424 | consumed samples: 9943040 | consumed tokens: 20363345920 | elapsed time per iteration (s): 2.25 | learning rate: 3.837E-05 | global batch size: 512 | lm loss: 1.998629E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.757 | TFLOPs: 23.45 | 63: iteration 19430/ 24424 | consumed samples: 9948160 | consumed tokens: 20373831680 | elapsed time per iteration (s): 2.24 | learning rate: 3.829E-05 | global batch size: 512 | lm loss: 1.984257E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.066 | TFLOPs: 23.48 | 63: iteration 19440/ 24424 | consumed samples: 9953280 | consumed tokens: 20384317440 | elapsed time per iteration (s): 2.26 | learning rate: 3.822E-05 | global batch size: 512 | lm loss: 2.015608E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.308 | TFLOPs: 23.30 | 63: iteration 19450/ 24424 | consumed samples: 9958400 | consumed tokens: 20394803200 | elapsed time per iteration (s): 2.25 | learning rate: 3.815E-05 | global batch size: 512 | lm loss: 1.994294E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.405 | TFLOPs: 23.41 | 63: iteration 19460/ 24424 | consumed samples: 9963520 | consumed tokens: 20405288960 | elapsed time per iteration (s): 2.23 | learning rate: 3.808E-05 | global batch size: 512 | lm loss: 1.999931E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.781 | TFLOPs: 23.65 | 63: iteration 19470/ 24424 | consumed samples: 9968640 | consumed tokens: 20415774720 | elapsed time per iteration (s): 2.26 | learning rate: 3.801E-05 | global batch size: 512 | lm loss: 1.996790E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.694 | TFLOPs: 23.34 | 63: iteration 19480/ 24424 | consumed samples: 9973760 | consumed tokens: 20426260480 | elapsed time per iteration (s): 2.37 | learning rate: 3.794E-05 | global batch size: 512 | lm loss: 1.997833E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 215.892 | TFLOPs: 22.23 | 63: iteration 19490/ 24424 | consumed samples: 9978880 | consumed tokens: 20436746240 | elapsed time per iteration (s): 2.32 | learning rate: 3.787E-05 | global batch size: 512 | lm loss: 1.999903E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.159 | TFLOPs: 22.77 | 63: iteration 19500/ 24424 | consumed samples: 9984000 | consumed tokens: 20447232000 | elapsed time per iteration (s): 2.25 | learning rate: 3.780E-05 | global batch size: 512 | lm loss: 1.992885E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.995 | TFLOPs: 23.47 | 63: iteration 19510/ 24424 | consumed samples: 9989120 | consumed tokens: 20457717760 | elapsed time per iteration (s): 2.30 | learning rate: 3.773E-05 | global batch size: 512 | lm loss: 2.028759E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.815 | TFLOPs: 22.94 | 63: iteration 19520/ 24424 | consumed samples: 9994240 | consumed tokens: 20468203520 | elapsed time per iteration (s): 2.24 | learning rate: 3.766E-05 | global batch size: 512 | lm loss: 1.987625E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.979 | TFLOPs: 23.57 | 63: iteration 19530/ 24424 | consumed samples: 9999360 | consumed tokens: 20478689280 | elapsed time per iteration (s): 2.23 | learning rate: 3.759E-05 | global batch size: 512 | lm loss: 1.994380E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.343 | TFLOPs: 23.61 | 63: iteration 19540/ 24424 | consumed samples: 10004480 | consumed tokens: 20489175040 | elapsed time per iteration (s): 2.26 | learning rate: 3.752E-05 | global batch size: 512 | lm loss: 1.989968E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.600 | TFLOPs: 23.33 | 63: iteration 19550/ 24424 | consumed samples: 10009600 | consumed tokens: 20499660800 | elapsed time per iteration (s): 2.26 | learning rate: 3.746E-05 | global batch size: 512 | lm loss: 1.995988E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.403 | TFLOPs: 23.31 | 63: iteration 19560/ 24424 | consumed samples: 10014720 | consumed tokens: 20510146560 | elapsed time per iteration (s): 2.24 | learning rate: 3.739E-05 | global batch size: 512 | lm loss: 1.993623E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.178 | TFLOPs: 23.49 | 63: iteration 19570/ 24424 | consumed samples: 10019840 | consumed tokens: 20520632320 | elapsed time per iteration (s): 2.27 | learning rate: 3.732E-05 | global batch size: 512 | lm loss: 1.994511E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.511 | TFLOPs: 23.22 | 63: iteration 19580/ 24424 | consumed samples: 10024960 | consumed tokens: 20531118080 | elapsed time per iteration (s): 2.23 | learning rate: 3.725E-05 | global batch size: 512 | lm loss: 1.990379E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.226 | TFLOPs: 23.60 | 63: iteration 19590/ 24424 | consumed samples: 10030080 | consumed tokens: 20541603840 | elapsed time per iteration (s): 2.23 | learning rate: 3.718E-05 | global batch size: 512 | lm loss: 2.005484E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.961 | TFLOPs: 23.67 | 63: iteration 19600/ 24424 | consumed samples: 10035200 | consumed tokens: 20552089600 | elapsed time per iteration (s): 2.24 | learning rate: 3.711E-05 | global batch size: 512 | lm loss: 1.985529E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.626 | TFLOPs: 23.54 | 63: iteration 19610/ 24424 | consumed samples: 10040320 | consumed tokens: 20562575360 | elapsed time per iteration (s): 2.26 | learning rate: 3.704E-05 | global batch size: 512 | lm loss: 1.990390E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.373 | TFLOPs: 23.30 | 63: iteration 19620/ 24424 | consumed samples: 10045440 | consumed tokens: 20573061120 | elapsed time per iteration (s): 2.24 | learning rate: 3.697E-05 | global batch size: 512 | lm loss: 1.981535E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.605 | TFLOPs: 23.53 | 63: iteration 19630/ 24424 | consumed samples: 10050560 | consumed tokens: 20583546880 | elapsed time per iteration (s): 2.22 | learning rate: 3.691E-05 | global batch size: 512 | lm loss: 1.972345E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.162 | TFLOPs: 23.69 | 63: iteration 19640/ 24424 | consumed samples: 10055680 | consumed tokens: 20594032640 | elapsed time per iteration (s): 2.23 | learning rate: 3.684E-05 | global batch size: 512 | lm loss: 1.995182E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.330 | TFLOPs: 23.61 | 63: iteration 19650/ 24424 | consumed samples: 10060800 | consumed tokens: 20604518400 | elapsed time per iteration (s): 2.23 | learning rate: 3.677E-05 | global batch size: 512 | lm loss: 2.006569E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.983 | TFLOPs: 23.68 | 63: iteration 19660/ 24424 | consumed samples: 10065920 | consumed tokens: 20615004160 | elapsed time per iteration (s): 2.25 | learning rate: 3.670E-05 | global batch size: 512 | lm loss: 2.000378E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.083 | TFLOPs: 23.38 | 63: iteration 19670/ 24424 | consumed samples: 10071040 | consumed tokens: 20625489920 | elapsed time per iteration (s): 2.38 | learning rate: 3.663E-05 | global batch size: 512 | lm loss: 2.001244E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 215.559 | TFLOPs: 22.19 | 63: iteration 19680/ 24424 | consumed samples: 10076160 | consumed tokens: 20635975680 | elapsed time per iteration (s): 2.23 | learning rate: 3.657E-05 | global batch size: 512 | lm loss: 1.992853E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.695 | TFLOPs: 23.65 | 63: iteration 19690/ 24424 | consumed samples: 10081280 | consumed tokens: 20646461440 | elapsed time per iteration (s): 2.24 | learning rate: 3.650E-05 | global batch size: 512 | lm loss: 1.990442E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.163 | TFLOPs: 23.49 | 63: iteration 19700/ 24424 | consumed samples: 10086400 | consumed tokens: 20656947200 | elapsed time per iteration (s): 2.31 | learning rate: 3.643E-05 | global batch size: 512 | lm loss: 1.997480E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.023 | TFLOPs: 22.86 | 63: iteration 19710/ 24424 | consumed samples: 10091520 | consumed tokens: 20667432960 | elapsed time per iteration (s): 2.25 | learning rate: 3.636E-05 | global batch size: 512 | lm loss: 1.971622E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.041 | TFLOPs: 23.48 | 63: iteration 19720/ 24424 | consumed samples: 10096640 | consumed tokens: 20677918720 | elapsed time per iteration (s): 2.23 | learning rate: 3.630E-05 | global batch size: 512 | lm loss: 2.001839E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.833 | TFLOPs: 23.66 | 63: iteration 19730/ 24424 | consumed samples: 10101760 | consumed tokens: 20688404480 | elapsed time per iteration (s): 2.24 | learning rate: 3.623E-05 | global batch size: 512 | lm loss: 2.014282E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.704 | TFLOPs: 23.54 | 63: iteration 19740/ 24424 | consumed samples: 10106880 | consumed tokens: 20698890240 | elapsed time per iteration (s): 2.25 | learning rate: 3.616E-05 | global batch size: 512 | lm loss: 2.002977E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.418 | TFLOPs: 23.41 | 63: iteration 19750/ 24424 | consumed samples: 10112000 | consumed tokens: 20709376000 | elapsed time per iteration (s): 2.28 | learning rate: 3.610E-05 | global batch size: 512 | lm loss: 2.012119E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.774 | TFLOPs: 23.14 | 63: iteration 19760/ 24424 | consumed samples: 10117120 | consumed tokens: 20719861760 | elapsed time per iteration (s): 2.29 | learning rate: 3.603E-05 | global batch size: 512 | lm loss: 1.983634E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.775 | TFLOPs: 23.04 | 63: iteration 19770/ 24424 | consumed samples: 10122240 | consumed tokens: 20730347520 | elapsed time per iteration (s): 2.25 | learning rate: 3.596E-05 | global batch size: 512 | lm loss: 1.991476E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.408 | TFLOPs: 23.41 | 63: iteration 19780/ 24424 | consumed samples: 10127360 | consumed tokens: 20740833280 | elapsed time per iteration (s): 2.24 | learning rate: 3.590E-05 | global batch size: 512 | lm loss: 1.976901E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.250 | TFLOPs: 23.50 | 63: iteration 19790/ 24424 | consumed samples: 10132480 | consumed tokens: 20751319040 | elapsed time per iteration (s): 2.24 | learning rate: 3.583E-05 | global batch size: 512 | lm loss: 1.987488E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.089 | TFLOPs: 23.48 | 63: iteration 19800/ 24424 | consumed samples: 10137600 | consumed tokens: 20761804800 | elapsed time per iteration (s): 4.44 | learning rate: 3.576E-05 | global batch size: 512 | lm loss: 1.986633E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 115.349 | TFLOPs: 11.87 | 63: iteration 19810/ 24424 | consumed samples: 10142720 | consumed tokens: 20772290560 | elapsed time per iteration (s): 2.26 | learning rate: 3.570E-05 | global batch size: 512 | lm loss: 2.007246E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.153 | TFLOPs: 23.28 | 63: iteration 19820/ 24424 | consumed samples: 10147840 | consumed tokens: 20782776320 | elapsed time per iteration (s): 2.25 | learning rate: 3.563E-05 | global batch size: 512 | lm loss: 1.996902E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.174 | TFLOPs: 23.39 | 63: iteration 19830/ 24424 | consumed samples: 10152960 | consumed tokens: 20793262080 | elapsed time per iteration (s): 2.24 | learning rate: 3.557E-05 | global batch size: 512 | lm loss: 2.004587E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.707 | TFLOPs: 23.54 | 63: iteration 19840/ 24424 | consumed samples: 10158080 | consumed tokens: 20803747840 | elapsed time per iteration (s): 2.26 | learning rate: 3.550E-05 | global batch size: 512 | lm loss: 1.978350E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.043 | TFLOPs: 23.37 | 63: iteration 19850/ 24424 | consumed samples: 10163200 | consumed tokens: 20814233600 | elapsed time per iteration (s): 2.23 | learning rate: 3.543E-05 | global batch size: 512 | lm loss: 1.998236E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.950 | TFLOPs: 23.67 | 63: iteration 19860/ 24424 | consumed samples: 10168320 | consumed tokens: 20824719360 | elapsed time per iteration (s): 2.24 | learning rate: 3.537E-05 | global batch size: 512 | lm loss: 2.009320E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.883 | TFLOPs: 23.56 | 63: iteration 19870/ 24424 | consumed samples: 10173440 | consumed tokens: 20835205120 | elapsed time per iteration (s): 2.24 | learning rate: 3.530E-05 | global batch size: 512 | lm loss: 1.990513E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.467 | TFLOPs: 23.52 | 63: iteration 19880/ 24424 | consumed samples: 10178560 | consumed tokens: 20845690880 | elapsed time per iteration (s): 2.28 | learning rate: 3.524E-05 | global batch size: 512 | lm loss: 1.979509E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.527 | TFLOPs: 23.11 | 63: iteration 19890/ 24424 | consumed samples: 10183680 | consumed tokens: 20856176640 | elapsed time per iteration (s): 2.30 | learning rate: 3.517E-05 | global batch size: 512 | lm loss: 1.989194E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.927 | TFLOPs: 22.95 | 63: iteration 19900/ 24424 | consumed samples: 10188800 | consumed tokens: 20866662400 | elapsed time per iteration (s): 2.25 | learning rate: 3.511E-05 | global batch size: 512 | lm loss: 1.996355E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.843 | TFLOPs: 23.46 | 63: iteration 19910/ 24424 | consumed samples: 10193920 | consumed tokens: 20877148160 | elapsed time per iteration (s): 2.23 | learning rate: 3.504E-05 | global batch size: 512 | lm loss: 1.987444E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.848 | TFLOPs: 23.66 | 63: iteration 19920/ 24424 | consumed samples: 10199040 | consumed tokens: 20887633920 | elapsed time per iteration (s): 2.27 | learning rate: 3.498E-05 | global batch size: 512 | lm loss: 2.000443E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.819 | TFLOPs: 23.25 | 63: iteration 19930/ 24424 | consumed samples: 10204160 | consumed tokens: 20898119680 | elapsed time per iteration (s): 2.25 | learning rate: 3.491E-05 | global batch size: 512 | lm loss: 1.990199E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.365 | TFLOPs: 23.41 | 63: iteration 19940/ 24424 | consumed samples: 10209280 | consumed tokens: 20908605440 | elapsed time per iteration (s): 2.23 | learning rate: 3.485E-05 | global batch size: 512 | lm loss: 1.995388E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.721 | TFLOPs: 23.65 | 63: iteration 19950/ 24424 | consumed samples: 10214400 | consumed tokens: 20919091200 | elapsed time per iteration (s): 2.24 | learning rate: 3.479E-05 | global batch size: 512 | lm loss: 1.964611E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.235 | TFLOPs: 23.50 | 63: iteration 19960/ 24424 | consumed samples: 10219520 | consumed tokens: 20929576960 | elapsed time per iteration (s): 2.27 | learning rate: 3.472E-05 | global batch size: 512 | lm loss: 1.992331E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.495 | TFLOPs: 23.21 | 63: iteration 19970/ 24424 | consumed samples: 10224640 | consumed tokens: 20940062720 | elapsed time per iteration (s): 2.27 | learning rate: 3.466E-05 | global batch size: 512 | lm loss: 1.989261E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.007 | TFLOPs: 23.27 | 63: iteration 19980/ 24424 | consumed samples: 10229760 | consumed tokens: 20950548480 | elapsed time per iteration (s): 2.78 | learning rate: 3.459E-05 | global batch size: 512 | lm loss: 1.985614E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.968 | TFLOPs: 18.94 | 63: iteration 19990/ 24424 | consumed samples: 10234880 | consumed tokens: 20961034240 | elapsed time per iteration (s): 2.25 | learning rate: 3.453E-05 | global batch size: 512 | lm loss: 1.973769E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.211 | TFLOPs: 23.39 | 0: [2022-11-26 06:40:35,388] [INFO] [logging.py:68:log_dist] [Rank 0] step=20000, skipped=0, lr=[3.4466688948606354e-05, 3.4466688948606354e-05, 3.4466688948606354e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] 63: iteration 20000/ 24424 | consumed samples: 10240000 | consumed tokens: 20971520000 | elapsed time per iteration (s): 2.26 | learning rate: 3.447E-05 | global batch size: 512 | lm loss: 2.001763E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.666 | TFLOPs: 23.33 | 0: steps: 20000 loss: 1.9724 iter time (s): 2.318 samples/sec: 220.840 63: ------------------------------------------------------------------------------------------- 63: valid loss at iteration 20000 | lm loss value: 1.937014E+00 | lm loss PPL: 6.938001E+00 | 63: ------------------------------------------------------------------------------------------- 0: saving checkpoint at iteration 20000 to checkpoints_3b9 0: [2022-11-26 06:40:36,227] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step20000 is begin to save! 0: [2022-11-26 06:40:36,254] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_01-model_00-model_states.pt... 32: [2022-11-26 06:40:36,255] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_21-model_00-model_states.pt... 32: [2022-11-26 06:40:36,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_21-model_00-model_states.pt. 32: [2022-11-26 06:40:36,504] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_22-model_00-model_states.pt... 0: [2022-11-26 06:40:36,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_01-model_00-model_states.pt. 0: [2022-11-26 06:40:36,643] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_03-model_00-model_states.pt... 32: [2022-11-26 06:40:36,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_22-model_00-model_states.pt. 32: [2022-11-26 06:40:36,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_23-model_00-model_states.pt... 0: [2022-11-26 06:40:36,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_03-model_00-model_states.pt. 0: [2022-11-26 06:40:36,885] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_04-model_00-model_states.pt... 32: [2022-11-26 06:40:36,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_23-model_00-model_states.pt. 32: [2022-11-26 06:40:36,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_24-model_00-model_states.pt... 0: [2022-11-26 06:40:37,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_04-model_00-model_states.pt. 0: [2022-11-26 06:40:37,125] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_05-model_00-model_states.pt... 32: [2022-11-26 06:40:37,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_24-model_00-model_states.pt. 32: [2022-11-26 06:40:37,210] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_25-model_00-model_states.pt... 0: [2022-11-26 06:40:37,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_05-model_00-model_states.pt. 0: [2022-11-26 06:40:37,367] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_06-model_00-model_states.pt... 32: [2022-11-26 06:40:37,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_25-model_00-model_states.pt. 32: [2022-11-26 06:40:37,447] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_26-model_00-model_states.pt... 0: [2022-11-26 06:40:37,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_06-model_00-model_states.pt. 0: [2022-11-26 06:40:37,601] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_07-model_00-model_states.pt... 32: [2022-11-26 06:40:37,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_26-model_00-model_states.pt. 32: [2022-11-26 06:40:37,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_27-model_00-model_states.pt... 0: [2022-11-26 06:40:37,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_07-model_00-model_states.pt. 0: [2022-11-26 06:40:37,838] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_08-model_00-model_states.pt... 32: [2022-11-26 06:40:37,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_27-model_00-model_states.pt. 32: [2022-11-26 06:40:37,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_28-model_00-model_states.pt... 0: [2022-11-26 06:40:38,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_08-model_00-model_states.pt. 0: [2022-11-26 06:40:38,073] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_09-model_00-model_states.pt... 32: [2022-11-26 06:40:38,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_28-model_00-model_states.pt. 32: [2022-11-26 06:40:38,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_29-model_00-model_states.pt... 0: [2022-11-26 06:40:38,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_09-model_00-model_states.pt. 0: [2022-11-26 06:40:38,312] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_10-model_00-model_states.pt... 32: [2022-11-26 06:40:38,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_29-model_00-model_states.pt. 32: [2022-11-26 06:40:38,391] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_30-model_00-model_states.pt... 0: [2022-11-26 06:40:38,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_10-model_00-model_states.pt. 0: [2022-11-26 06:40:38,536] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_11-model_00-model_states.pt... 32: [2022-11-26 06:40:38,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_30-model_00-model_states.pt. 32: [2022-11-26 06:40:38,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_31-model_00-model_states.pt... 0: [2022-11-26 06:40:38,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_11-model_00-model_states.pt. 0: [2022-11-26 06:40:38,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_12-model_00-model_states.pt... 32: [2022-11-26 06:40:38,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_31-model_00-model_states.pt. 32: [2022-11-26 06:40:38,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_32-model_00-model_states.pt... 0: [2022-11-26 06:40:38,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_12-model_00-model_states.pt. 0: [2022-11-26 06:40:38,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_13-model_00-model_states.pt... 32: [2022-11-26 06:40:39,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_32-model_00-model_states.pt. 32: [2022-11-26 06:40:39,090] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_33-model_00-model_states.pt... 0: [2022-11-26 06:40:39,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_13-model_00-model_states.pt. 0: [2022-11-26 06:40:39,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_14-model_00-model_states.pt... 32: [2022-11-26 06:40:39,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_33-model_00-model_states.pt. 32: [2022-11-26 06:40:39,320] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_34-model_00-model_states.pt... 0: [2022-11-26 06:40:39,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_14-model_00-model_states.pt. 0: [2022-11-26 06:40:39,420] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_15-model_00-model_states.pt... 32: [2022-11-26 06:40:39,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_34-model_00-model_states.pt. 32: [2022-11-26 06:40:39,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_35-model_00-model_states.pt... 0: [2022-11-26 06:40:39,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_15-model_00-model_states.pt. 0: [2022-11-26 06:40:39,635] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_16-model_00-model_states.pt... 32: [2022-11-26 06:40:39,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_35-model_00-model_states.pt. 32: [2022-11-26 06:40:39,781] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_36-model_00-model_states.pt... 0: [2022-11-26 06:40:39,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_16-model_00-model_states.pt. 0: [2022-11-26 06:40:39,855] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_17-model_00-model_states.pt... 32: [2022-11-26 06:40:40,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_36-model_00-model_states.pt. 32: [2022-11-26 06:40:40,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_37-model_00-model_states.pt... 0: [2022-11-26 06:40:40,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_17-model_00-model_states.pt. 0: [2022-11-26 06:40:40,074] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_18-model_00-model_states.pt... 32: [2022-11-26 06:40:40,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_37-model_00-model_states.pt. 32: [2022-11-26 06:40:40,244] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_38-model_00-model_states.pt... 0: [2022-11-26 06:40:40,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_18-model_00-model_states.pt. 0: [2022-11-26 06:40:40,294] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_19-model_00-model_states.pt... 32: [2022-11-26 06:40:40,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_38-model_00-model_states.pt. 32: [2022-11-26 06:40:40,467] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_40-model_00-model_states.pt... 32: [2022-11-26 06:40:40,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_40-model_00-model_states.pt. 32: [2022-11-26 06:40:40,473] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/mp_rank_01_model_states.pt... 32: [2022-11-26 06:40:40,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/mp_rank_01_model_states.pt. 0: [2022-11-26 06:40:40,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_19-model_00-model_states.pt. 0: [2022-11-26 06:40:40,510] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/layer_20-model_00-model_states.pt... 0: [2022-11-26 06:40:40,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/layer_20-model_00-model_states.pt. 0: [2022-11-26 06:40:40,727] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step20000/mp_rank_00_model_states.pt 0: [2022-11-26 06:40:40,727] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/mp_rank_00_model_states.pt... 0: [2022-11-26 06:40:40,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/mp_rank_00_model_states.pt. 60: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 60: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 32: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 50: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 50: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 50: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 38: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 38: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 51: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 51: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 55: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 55: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 53: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 53: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 53: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 57: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 57: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 57: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 61: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 61: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 59: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 59: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 63: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 63: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 63: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 52: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 52: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 54: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 54: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 56: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 56: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 58: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 58: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 62: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 62: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 62: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 60: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 33: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 33: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 35: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 35: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 34: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 34: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 40: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 40: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 44: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 44: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 44: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 32: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 32: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 32: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 42: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 42: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 42: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 46: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 46: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 46: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 46: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 36: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 36: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 48: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 48: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 48: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 41: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 41: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 41: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 37: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 37: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 37: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 47: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 49: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 43: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 43: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 43: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 45: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 39: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 39: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 50: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 38: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 38: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 51: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 51: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 55: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 53: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 57: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 61: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 59: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 63: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 19: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 21: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 15: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 52: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 52: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 54: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 54: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 54: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 56: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 56: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 58: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 58: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 62: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 62: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 16: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 60: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 2: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 4: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 4: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 4: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 14: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 18: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 18: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 10: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 10: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 8: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 26: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 11: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 11: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 17: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 9: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 13: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 13: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 23: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 25: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 25: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 25: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 25: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 27: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 27: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 29: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 31: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 31: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 31: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 31: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 5: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 5: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 33: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 33: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 1: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 1: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 1: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 7: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 3: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 35: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 35: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 22: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 12: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 30: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 24: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 24: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 24: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 24: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 28: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 28: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 28: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 20: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 20: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 20: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 34: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 40: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 44: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 6: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 32: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 32: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 32: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 42: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 46: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 36: [2022-11-26 06:40:40,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 48: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 41: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 37: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 47: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 47: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 47: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 49: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 43: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 45: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 45: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 39: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 50: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 38: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 55: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 53: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 57: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 63: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 19: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 21: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 15: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 15: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 58: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 16: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 2: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 2: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 4: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 4: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 14: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 18: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 10: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 10: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 8: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 26: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 11: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 17: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 17: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 17: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 9: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 9: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 13: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 13: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 23: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 25: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 27: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 29: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 29: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 31: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 31: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 31: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 5: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 1: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 7: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 3: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 3: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 3: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 22: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 12: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 30: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 24: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 28: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 20: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 34: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 40: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 44: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 6: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 32: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 42: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 46: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 36: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 48: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 37: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 47: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 49: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 43: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 45: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 39: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 53: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 57: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 63: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 19: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 21: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 21: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 15: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 16: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 2: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 4: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 14: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 14: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 18: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 10: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 10: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 8: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 26: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 11: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 17: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 9: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 13: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 13: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 23: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 25: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 27: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 29: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 31: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 5: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 5: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 1: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 7: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 3: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 3: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 22: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 12: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 30: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 24: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 28: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 20: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 44: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 6: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 48: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 47: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 19: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 19: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 21: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 15: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 16: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 2: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 4: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 14: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 14: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 18: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 10: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 8: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 26: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 11: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 17: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 9: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 13: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 23: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 25: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 27: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 27: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 29: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 5: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 5: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 1: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 7: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 3: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 22: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 12: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 30: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 30: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 24: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 28: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 20: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 6: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 19: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 21: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 15: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 16: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 2: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 4: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 14: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 18: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 10: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 8: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 11: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 17: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 9: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 9: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 23: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 27: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 29: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 1: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 7: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 3: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 22: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 12: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 30: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 24: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 28: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 20: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 6: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 19: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 21: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 15: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 16: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 2: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 14: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 8: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 26: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 11: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 9: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 23: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 27: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 29: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 1: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 7: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 22: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 12: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 30: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 28: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 6: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 19: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 21: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 16: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 2: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 18: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 8: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 26: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 23: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 29: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 7: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 22: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 12: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 12: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 30: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 6: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 6: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 16: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 26: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 23: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 7: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 22: [2022-11-26 06:40:40,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step20000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 0: [2022-11-26 06:40:40,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-26 06:40:40,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-26 06:40:40,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 40: [2022-11-26 06:40:40,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-26 06:40:40,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-26 06:40:40,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 2: [2022-11-26 06:40:40,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-26 06:40:40,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-26 06:40:40,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 8: [2022-11-26 06:40:40,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:40:40,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-26 06:40:41,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 12: [2022-11-26 06:40:41,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 12: [2022-11-26 06:40:41,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-26 06:40:41,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 16: [2022-11-26 06:40:41,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 14: [2022-11-26 06:40:41,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 16: [2022-11-26 06:40:41,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 14: [2022-11-26 06:40:41,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 16: [2022-11-26 06:40:41,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 14: [2022-11-26 06:40:41,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 10: [2022-11-26 06:40:41,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 45: [2022-11-26 06:40:41,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 10: [2022-11-26 06:40:41,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 45: [2022-11-26 06:40:41,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 10: [2022-11-26 06:40:41,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 45: [2022-11-26 06:40:41,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 4: [2022-11-26 06:40:41,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 31: [2022-11-26 06:40:41,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-26 06:40:41,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-26 06:40:41,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 25: [2022-11-26 06:40:41,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 25: [2022-11-26 06:40:41,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-26 06:40:41,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 53: [2022-11-26 06:40:41,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-26 06:40:41,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 7: [2022-11-26 06:40:41,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 53: [2022-11-26 06:40:41,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 7: [2022-11-26 06:40:41,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-26 06:40:41,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 48: [2022-11-26 06:40:41,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-26 06:40:41,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-26 06:40:41,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 44: [2022-11-26 06:40:41,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-26 06:40:41,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-26 06:40:41,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 21: [2022-11-26 06:40:41,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-26 06:40:41,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-26 06:40:41,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 49: [2022-11-26 06:40:41,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-26 06:40:41,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-26 06:40:41,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 0: [2022-11-26 06:40:41,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-26 06:40:41,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-26 06:40:41,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 17: [2022-11-26 06:40:41,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 19: [2022-11-26 06:40:41,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-26 06:40:41,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 17: [2022-11-26 06:40:41,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 19: [2022-11-26 06:40:41,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 17: [2022-11-26 06:40:41,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 24: [2022-11-26 06:40:41,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-26 06:40:41,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-26 06:40:41,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 23: [2022-11-26 06:40:41,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-26 06:40:41,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-26 06:40:41,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 32: [2022-11-26 06:40:41,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-26 06:40:41,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-26 06:40:41,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 22: [2022-11-26 06:40:41,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-26 06:40:41,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-26 06:40:41,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 16: [2022-11-26 06:40:41,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 13: [2022-11-26 06:40:41,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 37: [2022-11-26 06:40:41,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 13: [2022-11-26 06:40:41,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 37: [2022-11-26 06:40:41,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 16: [2022-11-26 06:40:41,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 13: [2022-11-26 06:40:41,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 37: [2022-11-26 06:40:41,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 16: [2022-11-26 06:40:41,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 4: [2022-11-26 06:40:41,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-26 06:40:41,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 29: [2022-11-26 06:40:41,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 29: [2022-11-26 06:40:41,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-26 06:40:41,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 15: [2022-11-26 06:40:41,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-26 06:40:41,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-26 06:40:41,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 34: [2022-11-26 06:40:41,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-26 06:40:41,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-26 06:40:41,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 34: [2022-11-26 06:40:41,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-26 06:40:41,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-26 06:40:41,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 53: [2022-11-26 06:40:41,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-26 06:40:41,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-26 06:40:41,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 47: [2022-11-26 06:40:41,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-26 06:40:41,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-26 06:40:41,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 7: [2022-11-26 06:40:41,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 7: [2022-11-26 06:40:41,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 7: [2022-11-26 06:40:41,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 25: [2022-11-26 06:40:41,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 25: [2022-11-26 06:40:41,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 25: [2022-11-26 06:40:41,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 47: [2022-11-26 06:40:41,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 31: [2022-11-26 06:40:41,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 47: [2022-11-26 06:40:41,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-26 06:40:41,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 31: [2022-11-26 06:40:41,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 22: [2022-11-26 06:40:41,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 31: [2022-11-26 06:40:41,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 22: [2022-11-26 06:40:41,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 22: [2022-11-26 06:40:41,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 38: [2022-11-26 06:40:41,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 38: [2022-11-26 06:40:41,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-26 06:40:41,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 14: [2022-11-26 06:40:41,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-26 06:40:41,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-26 06:40:41,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 40: [2022-11-26 06:40:41,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-26 06:40:41,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-26 06:40:41,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 59: [2022-11-26 06:40:41,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-26 06:40:41,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-26 06:40:41,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 24: [2022-11-26 06:40:41,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 42: [2022-11-26 06:40:41,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:40:41,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 24: [2022-11-26 06:40:41,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 60: [2022-11-26 06:40:41,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 24: [2022-11-26 06:40:41,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 60: [2022-11-26 06:40:41,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 42: [2022-11-26 06:40:41,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-26 06:40:41,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 45: [2022-11-26 06:40:41,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-26 06:40:41,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-26 06:40:41,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 8: [2022-11-26 06:40:41,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:40:41,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 14: [2022-11-26 06:40:41,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:40:41,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 14: [2022-11-26 06:40:41,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-26 06:40:41,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 61: [2022-11-26 06:40:41,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:40:41,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-26 06:40:41,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 13: [2022-11-26 06:40:41,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-26 06:40:41,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-26 06:40:41,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 40: [2022-11-26 06:40:41,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-26 06:40:41,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-26 06:40:41,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 8: [2022-11-26 06:40:41,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:40:41,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 11: [2022-11-26 06:40:41,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:40:41,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 11: [2022-11-26 06:40:41,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 11: [2022-11-26 06:40:41,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 52: [2022-11-26 06:40:41,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-26 06:40:41,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-26 06:40:41,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 4: [2022-11-26 06:40:41,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-26 06:40:41,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-26 06:40:41,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 23: [2022-11-26 06:40:41,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 46: [2022-11-26 06:40:41,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 23: [2022-11-26 06:40:41,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-26 06:40:41,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 46: [2022-11-26 06:40:41,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-26 06:40:41,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 30: [2022-11-26 06:40:41,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-26 06:40:41,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-26 06:40:41,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 15: [2022-11-26 06:40:41,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-26 06:40:41,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 15: [2022-11-26 06:40:41,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 59: [2022-11-26 06:40:41,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:40:41,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 59: [2022-11-26 06:40:41,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 60: [2022-11-26 06:40:41,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 59: [2022-11-26 06:40:41,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 60: [2022-11-26 06:40:41,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 26: [2022-11-26 06:40:41,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 26: [2022-11-26 06:40:41,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 26: [2022-11-26 06:40:41,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-26 06:40:41,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-26 06:40:41,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 26: [2022-11-26 06:40:41,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 0: [2022-11-26 06:40:41,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 29: [2022-11-26 06:40:41,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-26 06:40:41,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-26 06:40:41,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 2: [2022-11-26 06:40:41,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 2: [2022-11-26 06:40:41,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-26 06:40:41,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 4: [2022-11-26 06:40:41,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-26 06:40:41,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 19: [2022-11-26 06:40:41,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 4: [2022-11-26 06:40:41,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 19: [2022-11-26 06:40:41,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-26 06:40:41,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 17: [2022-11-26 06:40:41,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 17: [2022-11-26 06:40:41,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-26 06:40:41,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 49: [2022-11-26 06:40:41,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-26 06:40:41,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-26 06:40:41,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 58: [2022-11-26 06:40:41,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 12: [2022-11-26 06:40:41,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-26 06:40:41,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 48: [2022-11-26 06:40:41,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 58: [2022-11-26 06:40:41,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 11: [2022-11-26 06:40:41,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 12: [2022-11-26 06:40:41,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 48: [2022-11-26 06:40:41,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 58: [2022-11-26 06:40:41,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 11: [2022-11-26 06:40:41,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 48: [2022-11-26 06:40:41,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 11: [2022-11-26 06:40:41,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 42: [2022-11-26 06:40:41,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 42: [2022-11-26 06:40:41,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 55: [2022-11-26 06:40:41,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 42: [2022-11-26 06:40:41,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 55: [2022-11-26 06:40:41,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-26 06:40:41,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 32: [2022-11-26 06:40:41,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-26 06:40:41,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-26 06:40:41,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 38: [2022-11-26 06:40:41,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-26 06:40:41,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-26 06:40:41,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 33: [2022-11-26 06:40:41,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 56: [2022-11-26 06:40:41,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 33: [2022-11-26 06:40:41,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 33: [2022-11-26 06:40:41,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 56: [2022-11-26 06:40:41,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 56: [2022-11-26 06:40:41,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 10: [2022-11-26 06:40:41,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-26 06:40:41,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-26 06:40:41,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 30: [2022-11-26 06:40:41,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-26 06:40:41,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-26 06:40:41,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 56: [2022-11-26 06:40:41,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 22: [2022-11-26 06:40:41,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 56: [2022-11-26 06:40:41,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 22: [2022-11-26 06:40:41,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 54: [2022-11-26 06:40:41,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 22: [2022-11-26 06:40:41,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 54: [2022-11-26 06:40:41,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 56: [2022-11-26 06:40:41,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 54: [2022-11-26 06:40:41,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-26 06:40:41,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-26 06:40:41,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 54: [2022-11-26 06:40:41,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 43: [2022-11-26 06:40:41,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-26 06:40:41,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 43: [2022-11-26 06:40:41,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 43: [2022-11-26 06:40:41,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 51: [2022-11-26 06:40:41,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-26 06:40:41,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 43: [2022-11-26 06:40:41,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 43: [2022-11-26 06:40:41,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 51: [2022-11-26 06:40:41,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-26 06:40:41,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-26 06:40:41,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 51: [2022-11-26 06:40:41,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 46: [2022-11-26 06:40:41,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-26 06:40:41,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 46: [2022-11-26 06:40:41,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 6: [2022-11-26 06:40:41,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-26 06:40:41,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 6: [2022-11-26 06:40:41,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-26 06:40:41,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-26 06:40:41,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 6: [2022-11-26 06:40:41,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 53: [2022-11-26 06:40:41,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-26 06:40:41,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-26 06:40:41,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 25: [2022-11-26 06:40:41,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 25: [2022-11-26 06:40:41,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-26 06:40:41,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 37: [2022-11-26 06:40:41,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-26 06:40:41,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-26 06:40:41,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 61: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:40:41,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 61: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 47: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 47: [2022-11-26 06:40:41,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 61: [2022-11-26 06:40:41,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 47: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 61: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 7: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-26 06:40:41,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 37: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 58: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 7: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 37: [2022-11-26 06:40:41,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 58: [2022-11-26 06:40:41,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 37: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 58: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 32: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 32: [2022-11-26 06:40:41,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 39: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:40:41,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 32: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 39: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 39: [2022-11-26 06:40:41,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 55: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 55: [2022-11-26 06:40:41,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 15: [2022-11-26 06:40:41,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-26 06:40:41,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-26 06:40:41,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 10: [2022-11-26 06:40:41,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-26 06:40:41,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-26 06:40:41,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 41: [2022-11-26 06:40:41,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-26 06:40:41,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-26 06:40:41,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 23: [2022-11-26 06:40:41,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 23: [2022-11-26 06:40:41,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-26 06:40:41,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 24: [2022-11-26 06:40:41,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 16: [2022-11-26 06:40:41,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 24: [2022-11-26 06:40:41,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 16: [2022-11-26 06:40:41,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 24: [2022-11-26 06:40:41,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 16: [2022-11-26 06:40:41,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 0: [2022-11-26 06:40:41,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-26 06:40:41,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 59: [2022-11-26 06:40:41,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 45: [2022-11-26 06:40:41,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 59: [2022-11-26 06:40:41,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 45: [2022-11-26 06:40:41,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 59: [2022-11-26 06:40:41,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 45: [2022-11-26 06:40:41,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 31: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 0: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 31: [2022-11-26 06:40:41,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 55: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 0: [2022-11-26 06:40:41,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 42: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 41: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 55: [2022-11-26 06:40:41,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 0: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 41: [2022-11-26 06:40:41,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 55: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 42: [2022-11-26 06:40:41,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 41: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 50: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 44: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 50: [2022-11-26 06:40:41,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 42: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 50: [2022-11-26 06:40:41,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 50: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 27: [2022-11-26 06:40:41,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-26 06:40:41,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 27: [2022-11-26 06:40:41,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-26 06:40:41,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-26 06:40:41,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-26 06:40:41,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-26 06:40:41,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 27: [2022-11-26 06:40:41,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 27: [2022-11-26 06:40:41,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 6: [2022-11-26 06:40:41,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-26 06:40:41,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-26 06:40:41,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 60: [2022-11-26 06:40:41,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 41: [2022-11-26 06:40:41,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:40:41,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 41: [2022-11-26 06:40:41,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 60: [2022-11-26 06:40:41,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 41: [2022-11-26 06:40:41,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 33: [2022-11-26 06:40:41,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-26 06:40:41,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 19: [2022-11-26 06:40:41,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 33: [2022-11-26 06:40:41,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 19: [2022-11-26 06:40:41,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-26 06:40:41,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 48: [2022-11-26 06:40:41,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-26 06:40:41,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-26 06:40:41,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 49: [2022-11-26 06:40:41,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 49: [2022-11-26 06:40:41,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 11: [2022-11-26 06:40:41,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 49: [2022-11-26 06:40:41,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 11: [2022-11-26 06:40:41,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 44: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 11: [2022-11-26 06:40:41,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 44: [2022-11-26 06:40:41,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 11: [2022-11-26 06:40:41,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 44: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 11: [2022-11-26 06:40:41,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 44: [2022-11-26 06:40:41,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 11: [2022-11-26 06:40:41,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 44: [2022-11-26 06:40:41,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 29: [2022-11-26 06:40:41,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-26 06:40:41,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 2: [2022-11-26 06:40:41,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 29: [2022-11-26 06:40:41,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 2: [2022-11-26 06:40:41,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 2: [2022-11-26 06:40:41,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-26 06:40:41,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-26 06:40:41,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 2: [2022-11-26 06:40:41,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 13: [2022-11-26 06:40:41,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-26 06:40:41,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 38: [2022-11-26 06:40:41,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 13: [2022-11-26 06:40:41,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 38: [2022-11-26 06:40:41,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-26 06:40:41,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 16: [2022-11-26 06:40:41,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-26 06:40:41,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-26 06:40:41,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 26: [2022-11-26 06:40:41,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-26 06:40:41,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-26 06:40:41,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 12: [2022-11-26 06:40:41,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-26 06:40:41,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-26 06:40:41,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 20: [2022-11-26 06:40:41,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-26 06:40:41,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-26 06:40:41,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-26 06:40:41,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-26 06:40:41,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 20: [2022-11-26 06:40:41,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 57: [2022-11-26 06:40:41,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 57: [2022-11-26 06:40:41,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 57: [2022-11-26 06:40:41,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 9: [2022-11-26 06:40:41,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-26 06:40:41,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 9: [2022-11-26 06:40:41,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-26 06:40:41,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-26 06:40:41,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 9: [2022-11-26 06:40:41,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 46: [2022-11-26 06:40:41,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-26 06:40:41,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-26 06:40:41,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 62: [2022-11-26 06:40:41,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-26 06:40:41,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-26 06:40:41,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-26 06:40:41,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-26 06:40:41,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-26 06:40:41,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-26 06:40:41,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 62: [2022-11-26 06:40:41,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 62: [2022-11-26 06:40:41,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 17: [2022-11-26 06:40:41,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-26 06:40:41,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-26 06:40:41,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 56: [2022-11-26 06:40:41,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 56: [2022-11-26 06:40:41,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-26 06:40:41,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 5: [2022-11-26 06:40:41,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-26 06:40:41,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-26 06:40:41,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 5: [2022-11-26 06:40:41,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-26 06:40:41,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-26 06:40:41,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 29: [2022-11-26 06:40:41,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-26 06:40:41,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-26 06:40:41,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 3: [2022-11-26 06:40:41,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-26 06:40:41,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 3: [2022-11-26 06:40:41,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 3: [2022-11-26 06:40:41,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 3: [2022-11-26 06:40:41,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-26 06:40:41,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 3: [2022-11-26 06:40:41,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-26 06:40:41,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 3: [2022-11-26 06:40:41,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 8: [2022-11-26 06:40:41,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:40:41,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-26 06:40:41,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 40: [2022-11-26 06:40:41,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-26 06:40:41,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-26 06:40:41,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 35: [2022-11-26 06:40:41,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-26 06:40:41,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 35: [2022-11-26 06:40:41,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-26 06:40:41,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-26 06:40:41,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-26 06:40:41,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-26 06:40:41,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 35: [2022-11-26 06:40:41,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 35: [2022-11-26 06:40:41,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 14: [2022-11-26 06:40:41,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-26 06:40:41,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-26 06:40:41,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 57: [2022-11-26 06:40:41,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 57: [2022-11-26 06:40:41,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 57: [2022-11-26 06:40:41,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 63: [2022-11-26 06:40:41,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:40:41,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:40:41,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 36: [2022-11-26 06:40:41,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-26 06:40:41,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 36: [2022-11-26 06:40:41,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:40:41,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 36: [2022-11-26 06:40:41,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-26 06:40:41,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 63: [2022-11-26 06:40:41,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 63: [2022-11-26 06:40:41,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 36: [2022-11-26 06:40:41,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-26 06:40:41,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 36: [2022-11-26 06:40:41,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 36: [2022-11-26 06:40:41,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 34: [2022-11-26 06:40:41,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 4: [2022-11-26 06:40:41,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 34: [2022-11-26 06:40:41,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-26 06:40:41,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 34: [2022-11-26 06:40:41,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 34: [2022-11-26 06:40:41,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-26 06:40:41,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 27: [2022-11-26 06:40:41,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-26 06:40:41,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-26 06:40:41,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 13: [2022-11-26 06:40:41,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 13: [2022-11-26 06:40:41,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-26 06:40:41,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 22: [2022-11-26 06:40:41,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-26 06:40:41,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-26 06:40:41,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 32: [2022-11-26 06:40:41,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-26 06:40:41,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-26 06:40:41,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 5: [2022-11-26 06:40:41,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-26 06:40:41,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-26 06:40:41,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 63: [2022-11-26 06:40:41,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:40:41,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-26 06:40:41,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 3: [2022-11-26 06:40:41,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-26 06:40:41,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-26 06:40:41,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 4: [2022-11-26 06:40:41,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-26 06:40:41,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 12: [2022-11-26 06:40:41,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-26 06:40:41,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-26 06:40:41,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 52: [2022-11-26 06:40:41,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 28: [2022-11-26 06:40:41,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-26 06:40:41,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-26 06:40:41,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 52: [2022-11-26 06:40:41,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 28: [2022-11-26 06:40:41,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-26 06:40:41,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 52: [2022-11-26 06:40:41,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 28: [2022-11-26 06:40:41,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-26 06:40:41,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 28: [2022-11-26 06:40:41,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 28: [2022-11-26 06:40:41,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 47: [2022-11-26 06:40:41,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-26 06:40:41,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-26 06:40:41,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 62: [2022-11-26 06:40:41,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-26 06:40:41,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-26 06:40:41,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 1: [2022-11-26 06:40:41,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-26 06:40:41,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 1: [2022-11-26 06:40:41,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-26 06:40:41,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-26 06:40:41,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-26 06:40:41,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-26 06:40:41,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-26 06:40:41,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-26 06:40:41,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 1: [2022-11-26 06:40:41,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 1: [2022-11-26 06:40:41,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 1: [2022-11-26 06:40:41,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 18: [2022-11-26 06:40:41,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-26 06:40:41,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-26 06:40:41,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-26 06:40:41,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 28: [2022-11-26 06:40:41,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 18: [2022-11-26 06:40:41,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-26 06:40:41,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 28: [2022-11-26 06:40:41,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 18: [2022-11-26 06:40:41,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-26 06:40:41,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 28: [2022-11-26 06:40:41,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 18: [2022-11-26 06:40:41,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 18: [2022-11-26 06:40:41,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 18: [2022-11-26 06:40:41,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 18: [2022-11-26 06:40:41,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 31: [2022-11-26 06:40:41,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-26 06:40:41,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-26 06:40:41,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 24: [2022-11-26 06:40:41,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 24: [2022-11-26 06:40:41,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-26 06:40:41,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 61: [2022-11-26 06:40:41,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:40:41,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-26 06:40:41,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 57: [2022-11-26 06:40:41,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 50: [2022-11-26 06:40:41,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 50: [2022-11-26 06:40:41,083] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-26 06:40:41,083] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 57: [2022-11-26 06:40:41,083] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-26 06:40:41,083] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 58: [2022-11-26 06:40:41,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 58: [2022-11-26 06:40:41,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-26 06:40:41,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 30: [2022-11-26 06:40:41,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-26 06:40:41,087] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 30: [2022-11-26 06:40:41,087] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 6: [2022-11-26 06:40:41,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-26 06:40:41,087] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-26 06:40:41,087] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 60: [2022-11-26 06:40:41,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:40:41,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-26 06:40:41,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 53: [2022-11-26 06:40:41,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-26 06:40:41,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-26 06:40:41,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 54: [2022-11-26 06:40:41,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 54: [2022-11-26 06:40:41,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-26 06:40:41,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 43: [2022-11-26 06:40:41,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-26 06:40:41,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-26 06:40:41,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 9: [2022-11-26 06:40:41,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 9: [2022-11-26 06:40:41,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-26 06:40:41,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 21: [2022-11-26 06:40:41,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 21: [2022-11-26 06:40:41,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-26 06:40:41,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 39: [2022-11-26 06:40:41,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:40:41,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-26 06:40:41,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 42: [2022-11-26 06:40:41,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 42: [2022-11-26 06:40:41,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-26 06:40:41,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 20: [2022-11-26 06:40:41,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-26 06:40:41,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-26 06:40:41,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 7: [2022-11-26 06:40:41,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-26 06:40:41,098] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 7: [2022-11-26 06:40:41,098] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 10: [2022-11-26 06:40:41,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-26 06:40:41,099] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-26 06:40:41,099] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 33: [2022-11-26 06:40:41,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-26 06:40:41,100] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-26 06:40:41,100] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 25: [2022-11-26 06:40:41,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 25: [2022-11-26 06:40:41,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-26 06:40:41,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 59: [2022-11-26 06:40:41,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-26 06:40:41,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-26 06:40:41,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 17: [2022-11-26 06:40:41,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-26 06:40:41,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 45: [2022-11-26 06:40:41,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 17: [2022-11-26 06:40:41,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 45: [2022-11-26 06:40:41,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-26 06:40:41,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 48: [2022-11-26 06:40:41,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-26 06:40:41,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-26 06:40:41,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 46: [2022-11-26 06:40:41,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-26 06:40:41,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-26 06:40:41,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 21: [2022-11-26 06:40:41,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-26 06:40:41,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-26 06:40:41,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 52: [2022-11-26 06:40:41,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 52: [2022-11-26 06:40:41,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-26 06:40:41,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 15: [2022-11-26 06:40:41,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 15: [2022-11-26 06:40:41,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-26 06:40:41,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 44: [2022-11-26 06:40:41,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-26 06:40:41,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-26 06:40:41,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 49: [2022-11-26 06:40:41,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-26 06:40:41,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-26 06:40:41,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 55: [2022-11-26 06:40:41,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 55: [2022-11-26 06:40:41,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 55: [2022-11-26 06:40:41,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 0: [2022-11-26 06:40:41,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 40: [2022-11-26 06:40:41,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 0: [2022-11-26 06:40:41,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 40: [2022-11-26 06:40:41,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 0: [2022-11-26 06:40:41,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 40: [2022-11-26 06:40:41,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 26: [2022-11-26 06:40:41,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 26: [2022-11-26 06:40:41,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-26 06:40:41,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 11: [2022-11-26 06:40:41,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-26 06:40:41,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-26 06:40:41,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 37: [2022-11-26 06:40:41,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-26 06:40:41,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-26 06:40:41,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 23: [2022-11-26 06:40:41,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-26 06:40:41,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-26 06:40:41,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 19: [2022-11-26 06:40:41,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-26 06:40:41,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-26 06:40:41,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 16: [2022-11-26 06:40:41,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 2: [2022-11-26 06:40:41,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-26 06:40:41,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 16: [2022-11-26 06:40:41,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 2: [2022-11-26 06:40:41,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 16: [2022-11-26 06:40:41,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 51: [2022-11-26 06:40:41,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 51: [2022-11-26 06:40:41,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-26 06:40:41,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 35: [2022-11-26 06:40:41,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 35: [2022-11-26 06:40:41,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-26 06:40:41,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 38: [2022-11-26 06:40:41,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 38: [2022-11-26 06:40:41,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-26 06:40:41,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 56: [2022-11-26 06:40:41,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 56: [2022-11-26 06:40:41,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 56: [2022-11-26 06:40:41,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 36: [2022-11-26 06:40:41,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-26 06:40:41,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-26 06:40:41,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 12: [2022-11-26 06:40:41,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-26 06:40:41,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-26 06:40:41,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 14: [2022-11-26 06:40:41,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-26 06:40:41,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-26 06:40:41,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 29: [2022-11-26 06:40:41,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-26 06:40:41,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-26 06:40:41,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 34: [2022-11-26 06:40:41,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 13: [2022-11-26 06:40:41,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-26 06:40:41,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-26 06:40:41,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 34: [2022-11-26 06:40:41,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-26 06:40:41,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 41: [2022-11-26 06:40:41,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-26 06:40:41,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-26 06:40:41,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 4: [2022-11-26 06:40:41,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 4: [2022-11-26 06:40:41,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-26 06:40:41,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 32: [2022-11-26 06:40:41,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 32: [2022-11-26 06:40:41,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-26 06:40:41,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 47: [2022-11-26 06:40:41,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-26 06:40:41,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-26 06:40:41,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 5: [2022-11-26 06:40:41,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-26 06:40:41,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-26 06:40:41,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 22: [2022-11-26 06:40:41,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-26 06:40:41,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-26 06:40:41,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 21: [2022-11-26 06:40:41,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 21: [2022-11-26 06:40:41,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-26 06:40:41,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 8: [2022-11-26 06:40:41,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 18: [2022-11-26 06:40:41,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:40:41,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-26 06:40:41,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 18: [2022-11-26 06:40:41,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-26 06:40:41,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 1: [2022-11-26 06:40:41,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 1: [2022-11-26 06:40:41,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 1: [2022-11-26 06:40:41,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 3: [2022-11-26 06:40:41,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 57: [2022-11-26 06:40:41,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 3: [2022-11-26 06:40:41,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 57: [2022-11-26 06:40:41,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 3: [2022-11-26 06:40:41,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 57: [2022-11-26 06:40:41,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 58: [2022-11-26 06:40:41,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-26 06:40:41,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-26 06:40:41,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 27: [2022-11-26 06:40:41,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-26 06:40:41,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-26 06:40:41,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 62: [2022-11-26 06:40:41,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-26 06:40:41,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-26 06:40:41,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 43: [2022-11-26 06:40:41,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-26 06:40:41,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 43: [2022-11-26 06:40:41,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 31: [2022-11-26 06:40:41,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-26 06:40:41,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-26 06:40:41,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 53: [2022-11-26 06:40:41,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-26 06:40:41,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-26 06:40:41,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 60: [2022-11-26 06:40:41,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:40:41,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-26 06:40:41,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 24: [2022-11-26 06:40:41,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-26 06:40:41,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-26 06:40:41,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 54: [2022-11-26 06:40:41,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 54: [2022-11-26 06:40:41,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-26 06:40:41,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 9: [2022-11-26 06:40:41,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-26 06:40:41,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-26 06:40:41,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 6: [2022-11-26 06:40:41,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 6: [2022-11-26 06:40:41,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 6: [2022-11-26 06:40:41,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 30: [2022-11-26 06:40:41,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-26 06:40:41,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 28: [2022-11-26 06:40:41,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-26 06:40:41,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-26 06:40:41,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 30: [2022-11-26 06:40:41,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 10: [2022-11-26 06:40:41,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-26 06:40:41,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-26 06:40:41,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 42: [2022-11-26 06:40:41,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 42: [2022-11-26 06:40:41,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-26 06:40:41,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 25: [2022-11-26 06:40:41,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-26 06:40:41,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-26 06:40:41,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 20: [2022-11-26 06:40:41,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-26 06:40:41,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 20: [2022-11-26 06:40:41,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 48: [2022-11-26 06:40:41,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-26 06:40:41,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-26 06:40:41,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 17: [2022-11-26 06:40:41,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-26 06:40:41,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-26 06:40:41,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 59: [2022-11-26 06:40:41,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-26 06:40:41,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-26 06:40:41,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 61: [2022-11-26 06:40:41,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:40:41,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 33: [2022-11-26 06:40:41,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-26 06:40:41,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-26 06:40:41,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 61: [2022-11-26 06:40:41,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 50: [2022-11-26 06:40:41,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-26 06:40:41,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-26 06:40:41,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 39: [2022-11-26 06:40:41,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:40:41,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-26 06:40:41,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 45: [2022-11-26 06:40:41,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-26 06:40:41,179] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-26 06:40:41,179] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 49: [2022-11-26 06:40:41,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-26 06:40:41,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 49: [2022-11-26 06:40:41,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 7: [2022-11-26 06:40:41,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 7: [2022-11-26 06:40:41,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-26 06:40:41,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 21: [2022-11-26 06:40:41,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-26 06:40:41,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-26 06:40:41,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 52: [2022-11-26 06:40:41,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-26 06:40:41,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-26 06:40:41,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 15: [2022-11-26 06:40:41,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-26 06:40:41,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-26 06:40:41,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 44: [2022-11-26 06:40:41,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-26 06:40:41,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-26 06:40:41,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 26: [2022-11-26 06:40:41,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-26 06:40:41,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-26 06:40:41,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 55: [2022-11-26 06:40:41,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-26 06:40:41,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 35: [2022-11-26 06:40:41,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 55: [2022-11-26 06:40:41,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 35: [2022-11-26 06:40:41,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-26 06:40:41,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 37: [2022-11-26 06:40:41,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 37: [2022-11-26 06:40:41,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 16: [2022-11-26 06:40:41,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 37: [2022-11-26 06:40:41,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 16: [2022-11-26 06:40:41,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-26 06:40:41,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 63: [2022-11-26 06:40:41,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:40:41,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-26 06:40:41,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 36: [2022-11-26 06:40:41,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 36: [2022-11-26 06:40:41,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-26 06:40:41,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 0: [2022-11-26 06:40:41,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 0: [2022-11-26 06:40:41,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-26 06:40:41,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 38: [2022-11-26 06:40:41,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 38: [2022-11-26 06:40:41,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-26 06:40:41,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 40: [2022-11-26 06:40:41,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-26 06:40:41,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-26 06:40:41,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 46: [2022-11-26 06:40:41,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-26 06:40:41,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-26 06:40:41,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 11: [2022-11-26 06:40:41,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 11: [2022-11-26 06:40:41,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-26 06:40:41,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 51: [2022-11-26 06:40:41,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-26 06:40:41,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-26 06:40:41,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 56: [2022-11-26 06:40:41,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-26 06:40:41,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-26 06:40:41,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 14: [2022-11-26 06:40:41,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-26 06:40:41,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-26 06:40:41,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 19: [2022-11-26 06:40:41,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 19: [2022-11-26 06:40:41,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-26 06:40:41,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 23: [2022-11-26 06:40:41,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-26 06:40:41,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 23: [2022-11-26 06:40:41,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 2: [2022-11-26 06:40:41,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-26 06:40:41,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-26 06:40:41,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 29: [2022-11-26 06:40:41,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-26 06:40:41,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-26 06:40:41,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 41: [2022-11-26 06:40:41,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 41: [2022-11-26 06:40:41,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 41: [2022-11-26 06:40:41,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 12: [2022-11-26 06:40:41,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-26 06:40:41,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-26 06:40:41,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 13: [2022-11-26 06:40:41,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 13: [2022-11-26 06:40:41,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-26 06:40:41,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 4: [2022-11-26 06:40:41,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 4: [2022-11-26 06:40:41,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-26 06:40:41,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 8: [2022-11-26 06:40:41,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:40:41,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-26 06:40:41,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 34: [2022-11-26 06:40:41,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 34: [2022-11-26 06:40:41,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-26 06:40:41,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 22: [2022-11-26 06:40:41,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 1: [2022-11-26 06:40:41,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-26 06:40:41,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 22: [2022-11-26 06:40:41,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 1: [2022-11-26 06:40:41,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 22: [2022-11-26 06:40:41,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 27: [2022-11-26 06:40:41,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 27: [2022-11-26 06:40:41,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-26 06:40:41,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 32: [2022-11-26 06:40:41,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-26 06:40:41,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-26 06:40:41,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 5: [2022-11-26 06:40:41,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-26 06:40:41,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-26 06:40:41,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 18: [2022-11-26 06:40:41,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-26 06:40:41,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-26 06:40:41,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 47: [2022-11-26 06:40:41,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-26 06:40:41,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 47: [2022-11-26 06:40:41,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 62: [2022-11-26 06:40:41,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 57: [2022-11-26 06:40:41,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 62: [2022-11-26 06:40:41,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-26 06:40:41,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 57: [2022-11-26 06:40:41,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-26 06:40:41,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 58: [2022-11-26 06:40:41,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-26 06:40:41,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-26 06:40:41,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 3: [2022-11-26 06:40:41,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 3: [2022-11-26 06:40:41,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 3: [2022-11-26 06:40:41,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 63: [2022-11-26 06:40:41,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:40:41,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-26 06:40:41,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 30: [2022-11-26 06:40:41,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 30: [2022-11-26 06:40:41,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-26 06:40:41,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 28: [2022-11-26 06:40:41,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-26 06:40:41,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-26 06:40:41,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 43: [2022-11-26 06:40:41,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-26 06:40:41,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-26 06:40:41,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 50: [2022-11-26 06:40:41,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-26 06:40:41,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-26 06:40:41,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 53: [2022-11-26 06:40:41,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-26 06:40:41,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-26 06:40:41,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 60: [2022-11-26 06:40:41,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 45: [2022-11-26 06:40:41,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:40:41,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-26 06:40:41,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 45: [2022-11-26 06:40:41,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 45: [2022-11-26 06:40:41,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 9: [2022-11-26 06:40:41,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 6: [2022-11-26 06:40:41,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 9: [2022-11-26 06:40:41,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-26 06:40:41,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 6: [2022-11-26 06:40:41,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-26 06:40:41,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 7: [2022-11-26 06:40:41,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-26 06:40:41,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-26 06:40:41,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 59: [2022-11-26 06:40:41,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-26 06:40:41,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-26 06:40:41,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 42: [2022-11-26 06:40:41,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 42: [2022-11-26 06:40:41,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 42: [2022-11-26 06:40:41,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 33: [2022-11-26 06:40:41,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-26 06:40:41,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-26 06:40:41,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 44: [2022-11-26 06:40:41,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-26 06:40:41,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-26 06:40:41,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 20: [2022-11-26 06:40:41,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-26 06:40:41,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-26 06:40:41,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 54: [2022-11-26 06:40:41,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 54: [2022-11-26 06:40:41,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-26 06:40:41,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 61: [2022-11-26 06:40:41,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:40:41,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-26 06:40:41,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 25: [2022-11-26 06:40:41,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-26 06:40:41,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-26 06:40:41,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 17: [2022-11-26 06:40:41,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 17: [2022-11-26 06:40:41,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 17: [2022-11-26 06:40:41,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 39: [2022-11-26 06:40:41,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:40:41,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-26 06:40:41,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 31: [2022-11-26 06:40:41,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 31: [2022-11-26 06:40:41,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-26 06:40:41,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 52: [2022-11-26 06:40:41,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-26 06:40:41,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-26 06:40:41,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 24: [2022-11-26 06:40:41,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 48: [2022-11-26 06:40:41,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-26 06:40:41,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 24: [2022-11-26 06:40:41,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-26 06:40:41,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 48: [2022-11-26 06:40:41,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 55: [2022-11-26 06:40:41,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-26 06:40:41,240] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-26 06:40:41,240] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 46: [2022-11-26 06:40:41,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-26 06:40:41,240] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-26 06:40:41,240] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 21: [2022-11-26 06:40:41,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 35: [2022-11-26 06:40:41,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 21: [2022-11-26 06:40:41,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 35: [2022-11-26 06:40:41,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-26 06:40:41,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 21: [2022-11-26 06:40:41,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 19: [2022-11-26 06:40:41,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-26 06:40:41,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-26 06:40:41,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 10: [2022-11-26 06:40:41,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-26 06:40:41,243] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-26 06:40:41,243] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 15: [2022-11-26 06:40:41,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 15: [2022-11-26 06:40:41,243] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-26 06:40:41,243] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 37: [2022-11-26 06:40:41,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-26 06:40:41,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-26 06:40:41,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 40: [2022-11-26 06:40:41,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 40: [2022-11-26 06:40:41,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 40: [2022-11-26 06:40:41,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 49: [2022-11-26 06:40:41,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-26 06:40:41,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-26 06:40:41,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 12: [2022-11-26 06:40:41,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 12: [2022-11-26 06:40:41,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-26 06:40:41,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 0: [2022-11-26 06:40:41,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-26 06:40:41,249] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-26 06:40:41,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 51: [2022-11-26 06:40:41,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-26 06:40:41,249] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-26 06:40:41,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 29: [2022-11-26 06:40:41,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-26 06:40:41,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-26 06:40:41,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 16: [2022-11-26 06:40:41,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-26 06:40:41,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 16: [2022-11-26 06:40:41,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 23: [2022-11-26 06:40:41,253] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-26 06:40:41,253] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-26 06:40:41,253] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 26: [2022-11-26 06:40:41,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-26 06:40:41,254] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-26 06:40:41,254] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 11: [2022-11-26 06:40:41,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-26 06:40:41,256] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 11: [2022-11-26 06:40:41,256] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 2: [2022-11-26 06:40:41,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 56: [2022-11-26 06:40:41,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-26 06:40:41,256] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 2: [2022-11-26 06:40:41,256] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-26 06:40:41,256] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 56: [2022-11-26 06:40:41,256] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 38: [2022-11-26 06:40:41,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-26 06:40:41,257] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-26 06:40:41,257] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 14: [2022-11-26 06:40:41,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-26 06:40:41,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-26 06:40:41,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 4: [2022-11-26 06:40:41,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:40:41,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:40:41,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-26 06:40:41,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 27: [2022-11-26 06:40:41,260] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 22: [2022-11-26 06:40:41,260] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 27: [2022-11-26 06:40:41,260] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 22: [2022-11-26 06:40:41,260] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 27: [2022-11-26 06:40:41,260] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 22: [2022-11-26 06:40:41,260] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 51: [2022-11-26 06:40:41,260] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-26 06:40:41,260] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-26 06:40:41,260] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 36: [2022-11-26 06:40:41,260] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-26 06:40:41,261] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-26 06:40:41,261] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 1: [2022-11-26 06:40:41,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-26 06:40:41,262] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-26 06:40:41,262] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 18: [2022-11-26 06:40:41,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-26 06:40:41,263] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-26 06:40:41,263] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 57: [2022-11-26 06:40:41,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 57: [2022-11-26 06:40:41,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 57: [2022-11-26 06:40:41,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 4: [2022-11-26 06:40:41,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-26 06:40:41,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 3: [2022-11-26 06:40:41,265] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 41: [2022-11-26 06:40:41,265] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 41: [2022-11-26 06:40:41,265] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 3: [2022-11-26 06:40:41,265] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-26 06:40:41,265] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 41: [2022-11-26 06:40:41,265] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 13: [2022-11-26 06:40:41,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-26 06:40:41,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-26 06:40:41,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 47: [2022-11-26 06:40:41,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-26 06:40:41,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-26 06:40:41,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 34: [2022-11-26 06:40:41,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 34: [2022-11-26 06:40:41,269] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-26 06:40:41,269] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 62: [2022-11-26 06:40:41,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-26 06:40:41,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 62: [2022-11-26 06:40:41,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 5: [2022-11-26 06:40:41,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-26 06:40:41,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 5: [2022-11-26 06:40:41,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 60: [2022-11-26 06:40:41,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:40:41,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-26 06:40:41,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 58: [2022-11-26 06:40:41,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-26 06:40:41,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-26 06:40:41,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 50: [2022-11-26 06:40:41,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-26 06:40:41,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-26 06:40:41,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 7: [2022-11-26 06:40:41,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-26 06:40:41,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-26 06:40:41,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 6: [2022-11-26 06:40:41,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-26 06:40:41,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 28: [2022-11-26 06:40:41,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 6: [2022-11-26 06:40:41,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 28: [2022-11-26 06:40:41,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-26 06:40:41,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 30: [2022-11-26 06:40:41,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-26 06:40:41,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 9: [2022-11-26 06:40:41,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 30: [2022-11-26 06:40:41,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 20: [2022-11-26 06:40:41,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-26 06:40:41,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-26 06:40:41,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 43: [2022-11-26 06:40:41,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-26 06:40:41,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-26 06:40:41,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 54: [2022-11-26 06:40:41,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 24: [2022-11-26 06:40:41,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-26 06:40:41,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-26 06:40:41,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 48: [2022-11-26 06:40:41,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-26 06:40:41,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-26 06:40:41,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 17: [2022-11-26 06:40:41,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-26 06:40:41,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-26 06:40:41,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 53: [2022-11-26 06:40:41,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-26 06:40:41,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-26 06:40:41,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 32: [2022-11-26 06:40:41,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 45: [2022-11-26 06:40:41,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 32: [2022-11-26 06:40:41,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-26 06:40:41,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 45: [2022-11-26 06:40:41,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-26 06:40:41,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 9: [2022-11-26 06:40:41,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-26 06:40:41,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 31: [2022-11-26 06:40:41,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 39: [2022-11-26 06:40:41,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:40:41,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 31: [2022-11-26 06:40:41,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 39: [2022-11-26 06:40:41,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 31: [2022-11-26 06:40:41,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 61: [2022-11-26 06:40:41,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:40:41,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 42: [2022-11-26 06:40:41,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:40:41,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 42: [2022-11-26 06:40:41,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 42: [2022-11-26 06:40:41,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 25: [2022-11-26 06:40:41,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 10: [2022-11-26 06:40:41,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 25: [2022-11-26 06:40:41,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-26 06:40:41,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 10: [2022-11-26 06:40:41,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 10: [2022-11-26 06:40:41,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 59: [2022-11-26 06:40:41,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 54: [2022-11-26 06:40:41,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-26 06:40:41,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 59: [2022-11-26 06:40:41,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-26 06:40:41,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 46: [2022-11-26 06:40:41,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-26 06:40:41,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-26 06:40:41,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 44: [2022-11-26 06:40:41,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-26 06:40:41,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-26 06:40:41,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 33: [2022-11-26 06:40:41,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-26 06:40:41,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-26 06:40:41,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 52: [2022-11-26 06:40:41,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-26 06:40:41,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-26 06:40:41,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 35: [2022-11-26 06:40:41,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 35: [2022-11-26 06:40:41,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-26 06:40:41,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 49: [2022-11-26 06:40:41,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-26 06:40:41,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-26 06:40:41,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 63: [2022-11-26 06:40:41,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:40:41,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-26 06:40:41,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 37: [2022-11-26 06:40:41,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-26 06:40:41,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-26 06:40:41,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 21: [2022-11-26 06:40:41,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-26 06:40:41,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 21: [2022-11-26 06:40:41,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 15: [2022-11-26 06:40:41,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-26 06:40:41,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-26 06:40:41,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 55: [2022-11-26 06:40:41,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-26 06:40:41,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-26 06:40:41,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 26: [2022-11-26 06:40:41,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-26 06:40:41,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-26 06:40:41,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 34: [2022-11-26 06:40:41,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 40: [2022-11-26 06:40:41,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 14: [2022-11-26 06:40:41,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 40: [2022-11-26 06:40:41,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 34: [2022-11-26 06:40:41,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 40: [2022-11-26 06:40:41,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 34: [2022-11-26 06:40:41,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 14: [2022-11-26 06:40:41,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-26 06:40:41,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 4: [2022-11-26 06:40:41,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 29: [2022-11-26 06:40:41,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-26 06:40:41,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-26 06:40:41,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 0: [2022-11-26 06:40:41,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-26 06:40:41,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 0: [2022-11-26 06:40:41,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 16: [2022-11-26 06:40:41,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-26 06:40:41,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-26 06:40:41,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 18: [2022-11-26 06:40:41,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-26 06:40:41,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-26 06:40:41,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 11: [2022-11-26 06:40:41,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 4: [2022-11-26 06:40:41,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 11: [2022-11-26 06:40:41,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 4: [2022-11-26 06:40:41,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 11: [2022-11-26 06:40:41,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 36: [2022-11-26 06:40:41,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 36: [2022-11-26 06:40:41,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-26 06:40:41,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 38: [2022-11-26 06:40:41,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-26 06:40:41,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-26 06:40:41,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 23: [2022-11-26 06:40:41,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-26 06:40:41,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-26 06:40:41,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 41: [2022-11-26 06:40:41,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-26 06:40:41,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-26 06:40:41,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 12: [2022-11-26 06:40:41,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 19: [2022-11-26 06:40:41,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 12: [2022-11-26 06:40:41,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-26 06:40:41,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 19: [2022-11-26 06:40:41,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-26 06:40:41,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 2: [2022-11-26 06:40:41,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-26 06:40:41,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 1: [2022-11-26 06:40:41,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 2: [2022-11-26 06:40:41,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 1: [2022-11-26 06:40:41,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 1: [2022-11-26 06:40:41,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 56: [2022-11-26 06:40:41,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 56: [2022-11-26 06:40:41,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-26 06:40:41,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 32: [2022-11-26 06:40:41,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 32: [2022-11-26 06:40:41,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-26 06:40:41,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 22: [2022-11-26 06:40:41,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 27: [2022-11-26 06:40:41,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 22: [2022-11-26 06:40:41,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-26 06:40:41,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 27: [2022-11-26 06:40:41,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-26 06:40:41,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 5: [2022-11-26 06:40:41,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 5: [2022-11-26 06:40:41,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-26 06:40:41,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 13: [2022-11-26 06:40:41,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-26 06:40:41,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-26 06:40:41,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 8: [2022-11-26 06:40:41,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 8: [2022-11-26 06:40:41,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-26 06:40:41,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 47: [2022-11-26 06:40:41,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-26 06:40:41,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-26 06:40:41,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 3: [2022-11-26 06:40:41,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-26 06:40:41,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-26 06:40:41,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 57: [2022-11-26 06:40:41,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 57: [2022-11-26 06:40:41,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-26 06:40:41,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 62: [2022-11-26 06:40:41,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-26 06:40:41,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-26 06:40:41,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 61: [2022-11-26 06:40:41,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 61: [2022-11-26 06:40:41,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-26 06:40:41,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 28: [2022-11-26 06:40:41,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 6: [2022-11-26 06:40:41,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 28: [2022-11-26 06:40:41,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 6: [2022-11-26 06:40:41,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 28: [2022-11-26 06:40:41,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 6: [2022-11-26 06:40:41,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 33: [2022-11-26 06:40:41,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-26 06:40:41,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-26 06:40:41,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 50: [2022-11-26 06:40:41,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 24: [2022-11-26 06:40:41,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 50: [2022-11-26 06:40:41,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-26 06:40:41,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 24: [2022-11-26 06:40:41,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-26 06:40:41,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 43: [2022-11-26 06:40:41,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-26 06:40:41,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-26 06:40:41,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 41: [2022-11-26 06:40:41,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-26 06:40:41,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-26 06:40:41,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 53: [2022-11-26 06:40:41,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-26 06:40:41,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-26 06:40:41,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 63: [2022-11-26 06:40:41,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:40:41,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-26 06:40:41,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 7: [2022-11-26 06:40:41,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 7: [2022-11-26 06:40:41,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-26 06:40:41,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 30: [2022-11-26 06:40:41,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-26 06:40:41,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-26 06:40:41,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 45: [2022-11-26 06:40:41,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-26 06:40:41,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-26 06:40:41,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 42: [2022-11-26 06:40:41,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 42: [2022-11-26 06:40:41,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-26 06:40:41,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 59: [2022-11-26 06:40:41,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 48: [2022-11-26 06:40:41,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 59: [2022-11-26 06:40:41,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-26 06:40:41,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 48: [2022-11-26 06:40:41,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 49: [2022-11-26 06:40:41,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 48: [2022-11-26 06:40:41,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 37: [2022-11-26 06:40:41,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-26 06:40:41,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-26 06:40:41,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 49: [2022-11-26 06:40:41,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-26 06:40:41,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 58: [2022-11-26 06:40:41,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-26 06:40:41,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-26 06:40:41,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 9: [2022-11-26 06:40:41,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 9: [2022-11-26 06:40:41,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 17: [2022-11-26 06:40:41,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-26 06:40:41,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-26 06:40:41,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 5: [2022-11-26 06:40:41,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 19: [2022-11-26 06:40:41,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 33: [2022-11-26 06:40:41,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 5: [2022-11-26 06:40:41,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 19: [2022-11-26 06:40:41,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 5: [2022-11-26 06:40:41,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 19: [2022-11-26 06:40:41,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 33: [2022-11-26 06:40:41,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 9: [2022-11-26 06:40:41,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-26 06:40:41,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 33: [2022-11-26 06:40:41,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 9: [2022-11-26 06:40:41,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 9: [2022-11-26 06:40:41,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 26: [2022-11-26 06:40:41,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 26: [2022-11-26 06:40:41,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 26: [2022-11-26 06:40:41,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 54: [2022-11-26 06:40:41,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 54: [2022-11-26 06:40:41,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-26 06:40:41,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 50: [2022-11-26 06:40:41,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 50: [2022-11-26 06:40:41,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-26 06:40:41,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 25: [2022-11-26 06:40:41,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 36: [2022-11-26 06:40:41,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-26 06:40:41,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 25: [2022-11-26 06:40:41,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-26 06:40:41,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 36: [2022-11-26 06:40:41,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 23: [2022-11-26 06:40:41,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 23: [2022-11-26 06:40:41,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 23: [2022-11-26 06:40:41,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 57: [2022-11-26 06:40:41,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 57: [2022-11-26 06:40:41,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 57: [2022-11-26 06:40:41,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 30: [2022-11-26 06:40:41,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 30: [2022-11-26 06:40:41,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 44: [2022-11-26 06:40:41,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 30: [2022-11-26 06:40:41,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 44: [2022-11-26 06:40:41,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-26 06:40:41,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 39: [2022-11-26 06:40:41,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:40:41,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-26 06:40:41,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-26 06:40:41,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-26 06:40:41,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 39: [2022-11-26 06:40:41,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 20: [2022-11-26 06:40:41,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 31: [2022-11-26 06:40:41,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 20: [2022-11-26 06:40:41,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-26 06:40:41,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 31: [2022-11-26 06:40:41,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-26 06:40:41,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 10: [2022-11-26 06:40:41,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 10: [2022-11-26 06:40:41,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-26 06:40:41,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 38: [2022-11-26 06:40:41,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 58: [2022-11-26 06:40:41,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 38: [2022-11-26 06:40:41,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-26 06:40:41,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 55: [2022-11-26 06:40:41,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 58: [2022-11-26 06:40:41,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-26 06:40:41,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 55: [2022-11-26 06:40:41,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-26 06:40:41,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 20: [2022-11-26 06:40:41,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-26 06:40:41,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-26 06:40:41,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 21: [2022-11-26 06:40:41,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-26 06:40:41,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-26 06:40:41,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 56: [2022-11-26 06:40:41,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-26 06:40:41,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-26 06:40:41,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 35: [2022-11-26 06:40:41,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-26 06:40:41,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-26 06:40:41,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 51: [2022-11-26 06:40:41,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 15: [2022-11-26 06:40:41,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 46: [2022-11-26 06:40:41,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 51: [2022-11-26 06:40:41,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-26 06:40:41,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 15: [2022-11-26 06:40:41,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 46: [2022-11-26 06:40:41,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 15: [2022-11-26 06:40:41,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 46: [2022-11-26 06:40:41,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 63: [2022-11-26 06:40:41,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-26 06:40:41,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-26 06:40:41,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 60: [2022-11-26 06:40:41,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-26 06:40:41,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-26 06:40:41,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 54: [2022-11-26 06:40:41,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 54: [2022-11-26 06:40:41,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-26 06:40:41,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 51: [2022-11-26 06:40:41,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-26 06:40:41,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-26 06:40:41,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 52: [2022-11-26 06:40:41,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-26 06:40:41,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-26 06:40:41,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 52: [2022-11-26 06:40:41,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-26 06:40:41,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-26 06:40:41,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 43: [2022-11-26 06:40:41,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-26 06:40:41,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step20000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-26 06:40:41,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! 0: successfully saved checkpoint at iteration 20000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5187.61 63: iteration 20010/ 24424 | consumed samples: 10245120 | consumed tokens: 20982005760 | elapsed time per iteration (s): 2.83 | learning rate: 3.440E-05 | global batch size: 512 | lm loss: 2.016253E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.971 | TFLOPs: 18.63 | 63: iteration 20020/ 24424 | consumed samples: 10250240 | consumed tokens: 20992491520 | elapsed time per iteration (s): 2.27 | learning rate: 3.434E-05 | global batch size: 512 | lm loss: 2.003604E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.469 | TFLOPs: 23.21 | 63: iteration 20030/ 24424 | consumed samples: 10255360 | consumed tokens: 21002977280 | elapsed time per iteration (s): 2.23 | learning rate: 3.428E-05 | global batch size: 512 | lm loss: 1.997192E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.632 | TFLOPs: 23.64 | 63: iteration 20040/ 24424 | consumed samples: 10260480 | consumed tokens: 21013463040 | elapsed time per iteration (s): 2.26 | learning rate: 3.421E-05 | global batch size: 512 | lm loss: 1.976903E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.119 | TFLOPs: 23.28 | 63: iteration 20050/ 24424 | consumed samples: 10265600 | consumed tokens: 21023948800 | elapsed time per iteration (s): 2.23 | learning rate: 3.415E-05 | global batch size: 512 | lm loss: 2.004202E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.825 | TFLOPs: 23.66 | 63: iteration 20060/ 24424 | consumed samples: 10270720 | consumed tokens: 21034434560 | elapsed time per iteration (s): 2.25 | learning rate: 3.409E-05 | global batch size: 512 | lm loss: 1.994117E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.358 | TFLOPs: 23.41 | 63: iteration 20070/ 24424 | consumed samples: 10275840 | consumed tokens: 21044920320 | elapsed time per iteration (s): 2.31 | learning rate: 3.402E-05 | global batch size: 512 | lm loss: 1.976275E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.444 | TFLOPs: 22.80 | 63: iteration 20080/ 24424 | consumed samples: 10280960 | consumed tokens: 21055406080 | elapsed time per iteration (s): 2.25 | learning rate: 3.396E-05 | global batch size: 512 | lm loss: 1.979906E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.722 | TFLOPs: 23.44 | 63: iteration 20090/ 24424 | consumed samples: 10286080 | consumed tokens: 21065891840 | elapsed time per iteration (s): 2.27 | learning rate: 3.390E-05 | global batch size: 512 | lm loss: 1.993798E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.693 | TFLOPs: 23.23 | 63: iteration 20100/ 24424 | consumed samples: 10291200 | consumed tokens: 21076377600 | elapsed time per iteration (s): 2.23 | learning rate: 3.384E-05 | global batch size: 512 | lm loss: 1.995413E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.832 | TFLOPs: 23.66 | 63: iteration 20110/ 24424 | consumed samples: 10296320 | consumed tokens: 21086863360 | elapsed time per iteration (s): 2.35 | learning rate: 3.378E-05 | global batch size: 512 | lm loss: 1.998320E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 218.031 | TFLOPs: 22.45 | 63: iteration 20120/ 24424 | consumed samples: 10301440 | consumed tokens: 21097349120 | elapsed time per iteration (s): 2.24 | learning rate: 3.371E-05 | global batch size: 512 | lm loss: 1.983428E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.963 | TFLOPs: 23.57 | 63: iteration 20130/ 24424 | consumed samples: 10306560 | consumed tokens: 21107834880 | elapsed time per iteration (s): 2.23 | learning rate: 3.365E-05 | global batch size: 512 | lm loss: 1.993152E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.400 | TFLOPs: 23.62 | 63: iteration 20140/ 24424 | consumed samples: 10311680 | consumed tokens: 21118320640 | elapsed time per iteration (s): 2.24 | learning rate: 3.359E-05 | global batch size: 512 | lm loss: 1.984899E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.661 | TFLOPs: 23.54 | 63: iteration 20150/ 24424 | consumed samples: 10316800 | consumed tokens: 21128806400 | elapsed time per iteration (s): 2.23 | learning rate: 3.353E-05 | global batch size: 512 | lm loss: 1.990917E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.689 | TFLOPs: 23.65 | 63: iteration 20160/ 24424 | consumed samples: 10321920 | consumed tokens: 21139292160 | elapsed time per iteration (s): 2.25 | learning rate: 3.347E-05 | global batch size: 512 | lm loss: 1.999843E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.999 | TFLOPs: 23.47 | 63: iteration 20170/ 24424 | consumed samples: 10327040 | consumed tokens: 21149777920 | elapsed time per iteration (s): 2.23 | learning rate: 3.340E-05 | global batch size: 512 | lm loss: 1.995603E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.094 | TFLOPs: 23.58 | 63: iteration 20180/ 24424 | consumed samples: 10332160 | consumed tokens: 21160263680 | elapsed time per iteration (s): 2.24 | learning rate: 3.334E-05 | global batch size: 512 | lm loss: 1.988558E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.537 | TFLOPs: 23.53 | 63: iteration 20190/ 24424 | consumed samples: 10337280 | consumed tokens: 21170749440 | elapsed time per iteration (s): 2.29 | learning rate: 3.328E-05 | global batch size: 512 | lm loss: 1.986477E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.754 | TFLOPs: 23.03 | 63: iteration 20200/ 24424 | consumed samples: 10342400 | consumed tokens: 21181235200 | elapsed time per iteration (s): 2.27 | learning rate: 3.322E-05 | global batch size: 512 | lm loss: 1.993669E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.154 | TFLOPs: 23.18 | 63: iteration 20210/ 24424 | consumed samples: 10347520 | consumed tokens: 21191720960 | elapsed time per iteration (s): 2.23 | learning rate: 3.316E-05 | global batch size: 512 | lm loss: 1.998279E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.805 | TFLOPs: 23.66 | 63: iteration 20220/ 24424 | consumed samples: 10352640 | consumed tokens: 21202206720 | elapsed time per iteration (s): 2.24 | learning rate: 3.310E-05 | global batch size: 512 | lm loss: 1.995176E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.132 | TFLOPs: 23.49 | 63: iteration 20230/ 24424 | consumed samples: 10357760 | consumed tokens: 21212692480 | elapsed time per iteration (s): 2.23 | learning rate: 3.304E-05 | global batch size: 512 | lm loss: 1.965516E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.089 | TFLOPs: 23.58 | 63: iteration 20240/ 24424 | consumed samples: 10362880 | consumed tokens: 21223178240 | elapsed time per iteration (s): 2.25 | learning rate: 3.298E-05 | global batch size: 512 | lm loss: 2.015002E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.925 | TFLOPs: 23.46 | 63: iteration 20250/ 24424 | consumed samples: 10368000 | consumed tokens: 21233664000 | elapsed time per iteration (s): 2.24 | learning rate: 3.292E-05 | global batch size: 512 | lm loss: 1.990236E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.395 | TFLOPs: 23.51 | 63: iteration 20260/ 24424 | consumed samples: 10373120 | consumed tokens: 21244149760 | elapsed time per iteration (s): 2.29 | learning rate: 3.286E-05 | global batch size: 512 | lm loss: 1.987137E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.836 | TFLOPs: 23.04 | 63: iteration 20270/ 24424 | consumed samples: 10378240 | consumed tokens: 21254635520 | elapsed time per iteration (s): 2.34 | learning rate: 3.280E-05 | global batch size: 512 | lm loss: 2.016546E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 218.653 | TFLOPs: 22.51 | 63: iteration 20280/ 24424 | consumed samples: 10383360 | consumed tokens: 21265121280 | elapsed time per iteration (s): 2.29 | learning rate: 3.274E-05 | global batch size: 512 | lm loss: 2.003788E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.528 | TFLOPs: 23.01 | 63: iteration 20290/ 24424 | consumed samples: 10388480 | consumed tokens: 21275607040 | elapsed time per iteration (s): 4.27 | learning rate: 3.268E-05 | global batch size: 512 | lm loss: 1.999949E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 119.981 | TFLOPs: 12.35 | 63: iteration 20300/ 24424 | consumed samples: 10393600 | consumed tokens: 21286092800 | elapsed time per iteration (s): 2.27 | learning rate: 3.262E-05 | global batch size: 512 | lm loss: 1.986088E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.352 | TFLOPs: 23.20 | 63: iteration 20310/ 24424 | consumed samples: 10398720 | consumed tokens: 21296578560 | elapsed time per iteration (s): 2.23 | learning rate: 3.256E-05 | global batch size: 512 | lm loss: 1.991812E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.273 | TFLOPs: 23.60 | 63: iteration 20320/ 24424 | consumed samples: 10403840 | consumed tokens: 21307064320 | elapsed time per iteration (s): 2.26 | learning rate: 3.250E-05 | global batch size: 512 | lm loss: 1.995908E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.023 | TFLOPs: 23.37 | 63: iteration 20330/ 24424 | consumed samples: 10408960 | consumed tokens: 21317550080 | elapsed time per iteration (s): 2.25 | learning rate: 3.244E-05 | global batch size: 512 | lm loss: 2.001609E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.319 | TFLOPs: 23.40 | 63: iteration 20340/ 24424 | consumed samples: 10414080 | consumed tokens: 21328035840 | elapsed time per iteration (s): 2.25 | learning rate: 3.238E-05 | global batch size: 512 | lm loss: 1.982644E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.062 | TFLOPs: 23.48 | 63: iteration 20350/ 24424 | consumed samples: 10419200 | consumed tokens: 21338521600 | elapsed time per iteration (s): 2.24 | learning rate: 3.232E-05 | global batch size: 512 | lm loss: 1.982344E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.503 | TFLOPs: 23.52 | 63: iteration 20360/ 24424 | consumed samples: 10424320 | consumed tokens: 21349007360 | elapsed time per iteration (s): 2.26 | learning rate: 3.226E-05 | global batch size: 512 | lm loss: 1.983891E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.098 | TFLOPs: 23.28 | 63: iteration 20370/ 24424 | consumed samples: 10429440 | consumed tokens: 21359493120 | elapsed time per iteration (s): 2.30 | learning rate: 3.220E-05 | global batch size: 512 | lm loss: 1.995528E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.427 | TFLOPs: 22.90 | 63: iteration 20380/ 24424 | consumed samples: 10434560 | consumed tokens: 21369978880 | elapsed time per iteration (s): 2.44 | learning rate: 3.214E-05 | global batch size: 512 | lm loss: 1.975126E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 209.789 | TFLOPs: 21.60 | 63: iteration 20390/ 24424 | consumed samples: 10439680 | consumed tokens: 21380464640 | elapsed time per iteration (s): 2.31 | learning rate: 3.209E-05 | global batch size: 512 | lm loss: 1.971450E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.626 | TFLOPs: 22.82 | 63: iteration 20400/ 24424 | consumed samples: 10444800 | consumed tokens: 21390950400 | elapsed time per iteration (s): 2.25 | learning rate: 3.203E-05 | global batch size: 512 | lm loss: 2.001950E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.063 | TFLOPs: 23.38 | 63: iteration 20410/ 24424 | consumed samples: 10449920 | consumed tokens: 21401436160 | elapsed time per iteration (s): 2.25 | learning rate: 3.197E-05 | global batch size: 512 | lm loss: 1.981256E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.510 | TFLOPs: 23.42 | 63: iteration 20420/ 24424 | consumed samples: 10455040 | consumed tokens: 21411921920 | elapsed time per iteration (s): 2.38 | learning rate: 3.191E-05 | global batch size: 512 | lm loss: 1.997199E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 215.131 | TFLOPs: 22.15 | 63: iteration 20430/ 24424 | consumed samples: 10460160 | consumed tokens: 21422407680 | elapsed time per iteration (s): 2.25 | learning rate: 3.185E-05 | global batch size: 512 | lm loss: 1.986417E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.140 | TFLOPs: 23.38 | 63: iteration 20440/ 24424 | consumed samples: 10465280 | consumed tokens: 21432893440 | elapsed time per iteration (s): 2.25 | learning rate: 3.179E-05 | global batch size: 512 | lm loss: 1.998437E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.277 | TFLOPs: 23.40 | 63: iteration 20450/ 24424 | consumed samples: 10470400 | consumed tokens: 21443379200 | elapsed time per iteration (s): 2.58 | learning rate: 3.174E-05 | global batch size: 512 | lm loss: 1.984074E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 198.144 | TFLOPs: 20.40 | 63: iteration 20460/ 24424 | consumed samples: 10475520 | consumed tokens: 21453864960 | elapsed time per iteration (s): 2.23 | learning rate: 3.168E-05 | global batch size: 512 | lm loss: 1.993967E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.267 | TFLOPs: 23.60 | 63: iteration 20470/ 24424 | consumed samples: 10480640 | consumed tokens: 21464350720 | elapsed time per iteration (s): 2.23 | learning rate: 3.162E-05 | global batch size: 512 | lm loss: 1.989212E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.543 | TFLOPs: 23.63 | 63: iteration 20480/ 24424 | consumed samples: 10485760 | consumed tokens: 21474836480 | elapsed time per iteration (s): 2.25 | learning rate: 3.156E-05 | global batch size: 512 | lm loss: 2.009028E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.441 | TFLOPs: 23.41 | 63: iteration 20490/ 24424 | consumed samples: 10490880 | consumed tokens: 21485322240 | elapsed time per iteration (s): 2.26 | learning rate: 3.151E-05 | global batch size: 512 | lm loss: 1.971875E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.404 | TFLOPs: 23.31 | 63: iteration 20500/ 24424 | consumed samples: 10496000 | consumed tokens: 21495808000 | elapsed time per iteration (s): 2.24 | learning rate: 3.145E-05 | global batch size: 512 | lm loss: 1.992772E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.766 | TFLOPs: 23.55 | 63: iteration 20510/ 24424 | consumed samples: 10501120 | consumed tokens: 21506293760 | elapsed time per iteration (s): 2.26 | learning rate: 3.139E-05 | global batch size: 512 | lm loss: 2.002171E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.152 | TFLOPs: 23.28 | 63: iteration 20520/ 24424 | consumed samples: 10506240 | consumed tokens: 21516779520 | elapsed time per iteration (s): 2.26 | learning rate: 3.134E-05 | global batch size: 512 | lm loss: 1.997963E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.279 | TFLOPs: 23.29 | 63: iteration 20530/ 24424 | consumed samples: 10511360 | consumed tokens: 21527265280 | elapsed time per iteration (s): 2.24 | learning rate: 3.128E-05 | global batch size: 512 | lm loss: 1.979209E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.919 | TFLOPs: 23.57 | 63: iteration 20540/ 24424 | consumed samples: 10516480 | consumed tokens: 21537751040 | elapsed time per iteration (s): 2.28 | learning rate: 3.122E-05 | global batch size: 512 | lm loss: 2.003198E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.763 | TFLOPs: 23.14 | 63: iteration 20550/ 24424 | consumed samples: 10521600 | consumed tokens: 21548236800 | elapsed time per iteration (s): 2.27 | learning rate: 3.117E-05 | global batch size: 512 | lm loss: 1.967993E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.493 | TFLOPs: 23.21 | 63: iteration 20560/ 24424 | consumed samples: 10526720 | consumed tokens: 21558722560 | elapsed time per iteration (s): 2.25 | learning rate: 3.111E-05 | global batch size: 512 | lm loss: 1.988297E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.441 | TFLOPs: 23.41 | 63: iteration 20570/ 24424 | consumed samples: 10531840 | consumed tokens: 21569208320 | elapsed time per iteration (s): 4.03 | learning rate: 3.105E-05 | global batch size: 512 | lm loss: 2.000764E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 127.083 | TFLOPs: 13.08 | 63: iteration 20580/ 24424 | consumed samples: 10536960 | consumed tokens: 21579694080 | elapsed time per iteration (s): 2.28 | learning rate: 3.100E-05 | global batch size: 512 | lm loss: 2.007188E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.129 | TFLOPs: 23.07 | 63: iteration 20590/ 24424 | consumed samples: 10542080 | consumed tokens: 21590179840 | elapsed time per iteration (s): 3.38 | learning rate: 3.094E-05 | global batch size: 512 | lm loss: 1.960085E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 151.364 | TFLOPs: 15.58 | 63: iteration 20600/ 24424 | consumed samples: 10547200 | consumed tokens: 21600665600 | elapsed time per iteration (s): 2.26 | learning rate: 3.089E-05 | global batch size: 512 | lm loss: 1.977076E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.833 | TFLOPs: 23.35 | 63: iteration 20610/ 24424 | consumed samples: 10552320 | consumed tokens: 21611151360 | elapsed time per iteration (s): 2.24 | learning rate: 3.083E-05 | global batch size: 512 | lm loss: 1.994206E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.932 | TFLOPs: 23.57 | 63: iteration 20620/ 24424 | consumed samples: 10557440 | consumed tokens: 21621637120 | elapsed time per iteration (s): 2.22 | learning rate: 3.077E-05 | global batch size: 512 | lm loss: 1.988724E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.319 | TFLOPs: 23.71 | 63: iteration 20630/ 24424 | consumed samples: 10562560 | consumed tokens: 21632122880 | elapsed time per iteration (s): 2.24 | learning rate: 3.072E-05 | global batch size: 512 | lm loss: 1.980180E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.446 | TFLOPs: 23.52 | 63: iteration 20640/ 24424 | consumed samples: 10567680 | consumed tokens: 21642608640 | elapsed time per iteration (s): 2.25 | learning rate: 3.066E-05 | global batch size: 512 | lm loss: 1.995571E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.243 | TFLOPs: 23.39 | 63: iteration 20650/ 24424 | consumed samples: 10572800 | consumed tokens: 21653094400 | elapsed time per iteration (s): 2.26 | learning rate: 3.061E-05 | global batch size: 512 | lm loss: 1.984645E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.435 | TFLOPs: 23.31 | 63: iteration 20660/ 24424 | consumed samples: 10577920 | consumed tokens: 21663580160 | elapsed time per iteration (s): 2.23 | learning rate: 3.055E-05 | global batch size: 512 | lm loss: 2.001151E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.544 | TFLOPs: 23.63 | 63: iteration 20670/ 24424 | consumed samples: 10583040 | consumed tokens: 21674065920 | elapsed time per iteration (s): 2.23 | learning rate: 3.050E-05 | global batch size: 512 | lm loss: 1.994888E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.111 | TFLOPs: 23.69 | 63: iteration 20680/ 24424 | consumed samples: 10588160 | consumed tokens: 21684551680 | elapsed time per iteration (s): 2.28 | learning rate: 3.044E-05 | global batch size: 512 | lm loss: 1.991166E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.073 | TFLOPs: 23.07 | 63: iteration 20690/ 24424 | consumed samples: 10593280 | consumed tokens: 21695037440 | elapsed time per iteration (s): 2.30 | learning rate: 3.039E-05 | global batch size: 512 | lm loss: 1.980704E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.629 | TFLOPs: 22.92 | 63: iteration 20700/ 24424 | consumed samples: 10598400 | consumed tokens: 21705523200 | elapsed time per iteration (s): 2.28 | learning rate: 3.033E-05 | global batch size: 512 | lm loss: 1.976253E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.621 | TFLOPs: 23.12 | 63: iteration 20710/ 24424 | consumed samples: 10603520 | consumed tokens: 21716008960 | elapsed time per iteration (s): 2.29 | learning rate: 3.028E-05 | global batch size: 512 | lm loss: 1.975532E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.480 | TFLOPs: 23.01 | 63: iteration 20720/ 24424 | consumed samples: 10608640 | consumed tokens: 21726494720 | elapsed time per iteration (s): 2.41 | learning rate: 3.023E-05 | global batch size: 512 | lm loss: 1.977561E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 212.684 | TFLOPs: 21.89 | 63: iteration 20730/ 24424 | consumed samples: 10613760 | consumed tokens: 21736980480 | elapsed time per iteration (s): 2.25 | learning rate: 3.017E-05 | global batch size: 512 | lm loss: 1.977356E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.563 | TFLOPs: 23.43 | 63: iteration 20740/ 24424 | consumed samples: 10618880 | consumed tokens: 21747466240 | elapsed time per iteration (s): 4.91 | learning rate: 3.012E-05 | global batch size: 512 | lm loss: 1.981004E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 104.245 | TFLOPs: 10.73 | 63: iteration 20750/ 24424 | consumed samples: 10624000 | consumed tokens: 21757952000 | elapsed time per iteration (s): 2.28 | learning rate: 3.006E-05 | global batch size: 512 | lm loss: 2.006087E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.128 | TFLOPs: 23.07 | 63: iteration 20760/ 24424 | consumed samples: 10629120 | consumed tokens: 21768437760 | elapsed time per iteration (s): 2.24 | learning rate: 3.001E-05 | global batch size: 512 | lm loss: 2.007908E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.912 | TFLOPs: 23.57 | 63: iteration 20770/ 24424 | consumed samples: 10634240 | consumed tokens: 21778923520 | elapsed time per iteration (s): 2.23 | learning rate: 2.996E-05 | global batch size: 512 | lm loss: 2.011560E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.365 | TFLOPs: 23.61 | 63: iteration 20780/ 24424 | consumed samples: 10639360 | consumed tokens: 21789409280 | elapsed time per iteration (s): 2.27 | learning rate: 2.990E-05 | global batch size: 512 | lm loss: 1.991236E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.456 | TFLOPs: 23.21 | 63: iteration 20790/ 24424 | consumed samples: 10644480 | consumed tokens: 21799895040 | elapsed time per iteration (s): 2.24 | learning rate: 2.985E-05 | global batch size: 512 | lm loss: 1.970356E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.281 | TFLOPs: 23.50 | 63: iteration 20800/ 24424 | consumed samples: 10649600 | consumed tokens: 21810380800 | elapsed time per iteration (s): 2.23 | learning rate: 2.980E-05 | global batch size: 512 | lm loss: 1.994013E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.636 | TFLOPs: 23.64 | 63: iteration 20810/ 24424 | consumed samples: 10654720 | consumed tokens: 21820866560 | elapsed time per iteration (s): 2.23 | learning rate: 2.974E-05 | global batch size: 512 | lm loss: 1.983203E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.962 | TFLOPs: 23.67 | 63: iteration 20820/ 24424 | consumed samples: 10659840 | consumed tokens: 21831352320 | elapsed time per iteration (s): 2.25 | learning rate: 2.969E-05 | global batch size: 512 | lm loss: 1.987993E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.909 | TFLOPs: 23.46 | 63: iteration 20830/ 24424 | consumed samples: 10664960 | consumed tokens: 21841838080 | elapsed time per iteration (s): 2.23 | learning rate: 2.964E-05 | global batch size: 512 | lm loss: 2.004704E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.607 | TFLOPs: 23.64 | 63: iteration 20840/ 24424 | consumed samples: 10670080 | consumed tokens: 21852323840 | elapsed time per iteration (s): 2.24 | learning rate: 2.959E-05 | global batch size: 512 | lm loss: 2.000067E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.560 | TFLOPs: 23.53 | 63: iteration 20850/ 24424 | consumed samples: 10675200 | consumed tokens: 21862809600 | elapsed time per iteration (s): 2.23 | learning rate: 2.953E-05 | global batch size: 512 | lm loss: 2.002296E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.156 | TFLOPs: 23.59 | 63: iteration 20860/ 24424 | consumed samples: 10680320 | consumed tokens: 21873295360 | elapsed time per iteration (s): 2.25 | learning rate: 2.948E-05 | global batch size: 512 | lm loss: 1.987431E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.559 | TFLOPs: 23.43 | 63: iteration 20870/ 24424 | consumed samples: 10685440 | consumed tokens: 21883781120 | elapsed time per iteration (s): 2.23 | learning rate: 2.943E-05 | global batch size: 512 | lm loss: 1.999490E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.348 | TFLOPs: 23.61 | 63: iteration 20880/ 24424 | consumed samples: 10690560 | consumed tokens: 21894266880 | elapsed time per iteration (s): 2.24 | learning rate: 2.938E-05 | global batch size: 512 | lm loss: 1.998443E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.167 | TFLOPs: 23.49 | 63: iteration 20890/ 24424 | consumed samples: 10695680 | consumed tokens: 21904752640 | elapsed time per iteration (s): 2.26 | learning rate: 2.933E-05 | global batch size: 512 | lm loss: 2.002202E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.605 | TFLOPs: 23.33 | 63: iteration 20900/ 24424 | consumed samples: 10700800 | consumed tokens: 21915238400 | elapsed time per iteration (s): 2.25 | learning rate: 2.927E-05 | global batch size: 512 | lm loss: 1.974748E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.193 | TFLOPs: 23.39 | 63: iteration 20910/ 24424 | consumed samples: 10705920 | consumed tokens: 21925724160 | elapsed time per iteration (s): 2.25 | learning rate: 2.922E-05 | global batch size: 512 | lm loss: 1.987656E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.926 | TFLOPs: 23.46 | 63: iteration 20920/ 24424 | consumed samples: 10711040 | consumed tokens: 21936209920 | elapsed time per iteration (s): 2.24 | learning rate: 2.917E-05 | global batch size: 512 | lm loss: 1.981467E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.462 | TFLOPs: 23.52 | 63: iteration 20930/ 24424 | consumed samples: 10716160 | consumed tokens: 21946695680 | elapsed time per iteration (s): 2.24 | learning rate: 2.912E-05 | global batch size: 512 | lm loss: 1.989944E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.435 | TFLOPs: 23.52 | 63: iteration 20940/ 24424 | consumed samples: 10721280 | consumed tokens: 21957181440 | elapsed time per iteration (s): 2.26 | learning rate: 2.907E-05 | global batch size: 512 | lm loss: 1.991302E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.806 | TFLOPs: 23.35 | 63: iteration 20950/ 24424 | consumed samples: 10726400 | consumed tokens: 21967667200 | elapsed time per iteration (s): 2.23 | learning rate: 2.902E-05 | global batch size: 512 | lm loss: 2.004341E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.573 | TFLOPs: 23.63 | 63: iteration 20960/ 24424 | consumed samples: 10731520 | consumed tokens: 21978152960 | elapsed time per iteration (s): 2.27 | learning rate: 2.897E-05 | global batch size: 512 | lm loss: 1.994126E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.795 | TFLOPs: 23.24 | 63: iteration 20970/ 24424 | consumed samples: 10736640 | consumed tokens: 21988638720 | elapsed time per iteration (s): 2.28 | learning rate: 2.891E-05 | global batch size: 512 | lm loss: 1.978738E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.156 | TFLOPs: 23.08 | 63: iteration 20980/ 24424 | consumed samples: 10741760 | consumed tokens: 21999124480 | elapsed time per iteration (s): 2.25 | learning rate: 2.886E-05 | global batch size: 512 | lm loss: 2.004431E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.258 | TFLOPs: 23.40 | 63: iteration 20990/ 24424 | consumed samples: 10746880 | consumed tokens: 22009610240 | elapsed time per iteration (s): 2.25 | learning rate: 2.881E-05 | global batch size: 512 | lm loss: 1.982720E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.956 | TFLOPs: 23.47 | 63: iteration 21000/ 24424 | consumed samples: 10752000 | consumed tokens: 22020096000 | elapsed time per iteration (s): 2.24 | learning rate: 2.876E-05 | global batch size: 512 | lm loss: 1.982060E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.069 | TFLOPs: 23.58 | 63: ------------------------------------------------------------------------------------------- 63: valid loss at iteration 21000 | lm loss value: 1.935982E+00 | lm loss PPL: 6.930844E+00 | 63: ------------------------------------------------------------------------------------------- 0: saving checkpoint at iteration 21000 to checkpoints_3b9 0: [2022-11-26 07:19:40,287] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step21000 is begin to save! 0: [2022-11-26 07:19:40,312] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_01-model_00-model_states.pt... 32: [2022-11-26 07:19:40,312] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_21-model_00-model_states.pt... 32: [2022-11-26 07:19:40,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_21-model_00-model_states.pt. 32: [2022-11-26 07:19:40,570] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_22-model_00-model_states.pt... 0: [2022-11-26 07:19:40,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_01-model_00-model_states.pt. 0: [2022-11-26 07:19:40,678] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_03-model_00-model_states.pt... 32: [2022-11-26 07:19:40,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_22-model_00-model_states.pt. 32: [2022-11-26 07:19:40,808] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_23-model_00-model_states.pt... 0: [2022-11-26 07:19:40,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_03-model_00-model_states.pt. 0: [2022-11-26 07:19:40,918] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_04-model_00-model_states.pt... 32: [2022-11-26 07:19:41,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_23-model_00-model_states.pt. 32: [2022-11-26 07:19:41,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_24-model_00-model_states.pt... 0: [2022-11-26 07:19:41,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_04-model_00-model_states.pt. 0: [2022-11-26 07:19:41,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_05-model_00-model_states.pt... 32: [2022-11-26 07:19:41,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_24-model_00-model_states.pt. 32: [2022-11-26 07:19:41,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_25-model_00-model_states.pt... 0: [2022-11-26 07:19:41,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_05-model_00-model_states.pt. 0: [2022-11-26 07:19:41,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_06-model_00-model_states.pt... 32: [2022-11-26 07:19:41,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_25-model_00-model_states.pt. 32: [2022-11-26 07:19:41,511] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_26-model_00-model_states.pt... 0: [2022-11-26 07:19:41,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_06-model_00-model_states.pt. 0: [2022-11-26 07:19:41,631] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_07-model_00-model_states.pt... 32: [2022-11-26 07:19:41,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_26-model_00-model_states.pt. 32: [2022-11-26 07:19:41,749] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_27-model_00-model_states.pt... 0: [2022-11-26 07:19:41,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_07-model_00-model_states.pt. 0: [2022-11-26 07:19:41,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_08-model_00-model_states.pt... 32: [2022-11-26 07:19:41,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_27-model_00-model_states.pt. 32: [2022-11-26 07:19:41,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_28-model_00-model_states.pt... 0: [2022-11-26 07:19:42,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_08-model_00-model_states.pt. 0: [2022-11-26 07:19:42,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_09-model_00-model_states.pt... 32: [2022-11-26 07:19:42,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_28-model_00-model_states.pt. 32: [2022-11-26 07:19:42,216] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_29-model_00-model_states.pt... 0: [2022-11-26 07:19:42,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_09-model_00-model_states.pt. 0: [2022-11-26 07:19:42,347] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_10-model_00-model_states.pt... 32: [2022-11-26 07:19:42,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_29-model_00-model_states.pt. 32: [2022-11-26 07:19:42,447] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_30-model_00-model_states.pt... 0: [2022-11-26 07:19:42,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_10-model_00-model_states.pt. 0: [2022-11-26 07:19:42,580] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_11-model_00-model_states.pt... 32: [2022-11-26 07:19:42,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_30-model_00-model_states.pt. 32: [2022-11-26 07:19:42,676] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_31-model_00-model_states.pt... 0: [2022-11-26 07:19:42,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_11-model_00-model_states.pt. 0: [2022-11-26 07:19:42,821] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_12-model_00-model_states.pt... 32: [2022-11-26 07:19:42,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_31-model_00-model_states.pt. 32: [2022-11-26 07:19:42,906] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_32-model_00-model_states.pt... 0: [2022-11-26 07:19:43,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_12-model_00-model_states.pt. 0: [2022-11-26 07:19:43,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_13-model_00-model_states.pt... 32: [2022-11-26 07:19:43,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_32-model_00-model_states.pt. 32: [2022-11-26 07:19:43,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_33-model_00-model_states.pt... 0: [2022-11-26 07:19:43,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_13-model_00-model_states.pt. 0: [2022-11-26 07:19:43,280] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_14-model_00-model_states.pt... 32: [2022-11-26 07:19:43,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_33-model_00-model_states.pt. 32: [2022-11-26 07:19:43,373] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_34-model_00-model_states.pt... 0: [2022-11-26 07:19:43,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_14-model_00-model_states.pt. 0: [2022-11-26 07:19:43,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_15-model_00-model_states.pt... 32: [2022-11-26 07:19:43,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_34-model_00-model_states.pt. 32: [2022-11-26 07:19:43,602] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_35-model_00-model_states.pt... 0: [2022-11-26 07:19:43,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_15-model_00-model_states.pt. 0: [2022-11-26 07:19:43,747] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_16-model_00-model_states.pt... 32: [2022-11-26 07:19:43,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_35-model_00-model_states.pt. 32: [2022-11-26 07:19:43,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_36-model_00-model_states.pt... 0: [2022-11-26 07:19:43,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_16-model_00-model_states.pt. 0: [2022-11-26 07:19:43,984] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_17-model_00-model_states.pt... 32: [2022-11-26 07:19:44,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_36-model_00-model_states.pt. 32: [2022-11-26 07:19:44,055] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_37-model_00-model_states.pt... 0: [2022-11-26 07:19:44,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_17-model_00-model_states.pt. 0: [2022-11-26 07:19:44,213] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_18-model_00-model_states.pt... 32: [2022-11-26 07:19:44,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_37-model_00-model_states.pt. 32: [2022-11-26 07:19:44,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_38-model_00-model_states.pt... 0: [2022-11-26 07:19:44,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_18-model_00-model_states.pt. 0: [2022-11-26 07:19:44,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_19-model_00-model_states.pt... 32: [2022-11-26 07:19:44,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_38-model_00-model_states.pt. 32: [2022-11-26 07:19:44,512] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_40-model_00-model_states.pt... 32: [2022-11-26 07:19:44,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_40-model_00-model_states.pt. 32: [2022-11-26 07:19:44,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/mp_rank_01_model_states.pt... 32: [2022-11-26 07:19:44,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/mp_rank_01_model_states.pt. 0: [2022-11-26 07:19:44,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_19-model_00-model_states.pt. 0: [2022-11-26 07:19:44,671] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/layer_20-model_00-model_states.pt... 0: [2022-11-26 07:19:44,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/layer_20-model_00-model_states.pt. 0: [2022-11-26 07:19:44,900] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step21000/mp_rank_00_model_states.pt 0: [2022-11-26 07:19:44,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/mp_rank_00_model_states.pt... 0: [2022-11-26 07:19:44,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/mp_rank_00_model_states.pt. 59: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 60: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 60: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 40: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 32: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 46: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 46: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 46: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 50: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 50: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 50: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 38: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 51: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 55: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 55: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 53: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 53: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 57: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 57: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 61: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 59: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 59: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 59: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 59: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 63: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 63: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 63: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 52: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 52: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 54: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 54: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 56: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 58: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 58: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 58: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 58: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 62: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 62: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 62: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 60: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 60: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 33: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 33: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 33: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 35: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 35: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 34: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 34: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 40: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 44: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 44: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 44: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 32: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 32: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 32: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 32: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 42: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 42: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 42: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 46: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 36: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 36: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 48: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 48: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 41: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 41: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 41: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 41: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 37: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 37: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 37: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 47: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 47: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 47: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 49: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 49: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 43: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 45: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 45: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 45: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 45: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 39: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 39: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 39: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 39: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 50: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 38: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 38: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 38: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 51: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 51: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 55: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 53: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 57: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 61: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 61: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 63: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 19: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 19: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 19: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 21: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 15: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 52: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 54: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 56: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 58: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 62: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 16: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 60: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 2: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 2: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 4: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 4: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 14: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 14: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 14: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 14: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 18: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 10: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 8: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 26: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 11: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 11: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 17: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 17: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 17: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 9: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 9: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 9: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 13: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 23: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 25: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 25: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 25: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 27: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 29: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 31: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 31: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 31: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 31: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 31: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 5: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 5: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 5: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 33: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 1: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 3: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 35: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 22: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 12: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 30: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 24: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 28: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 20: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 20: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 34: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 40: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 44: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 6: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 6: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 6: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 6: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 32: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 42: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 46: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 36: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 36: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 48: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 48: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 41: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 37: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 47: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 49: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 49: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 43: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 43: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 45: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 39: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 38: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 53: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 57: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 19: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 21: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 15: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 15: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 52: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 54: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 56: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 16: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 16: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 2: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 2: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 4: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 4: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 4: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 14: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 10: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 8: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 11: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 17: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 9: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 13: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 23: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 23: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 23: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 23: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 23: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 23: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 25: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 25: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 27: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 27: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 29: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 31: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 5: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 33: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 1: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 7: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 7: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 7: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 3: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 3: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 22: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 12: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 30: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 24: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 28: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 20: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 20: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 34: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 40: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 44: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 6: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 32: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 48: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 37: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 47: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 45: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 38: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 19: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 21: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 15: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 54: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 16: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 2: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 4: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 14: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 14: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 18: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 10: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 10: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 8: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 26: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 11: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 17: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 9: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 23: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 25: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 27: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 27: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 27: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 29: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 31: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 5: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 5: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 1: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 7: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 7: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 7: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 7: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 3: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 22: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 12: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 12: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 30: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 24: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 24: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 28: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 20: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 40: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 44: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 44: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 6: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 32: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 48: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 45: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 19: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 19: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 21: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 15: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 54: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 16: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 2: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 4: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 14: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 18: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 10: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 8: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 26: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 11: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 17: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 9: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 9: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 13: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 13: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 25: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 27: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 29: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 31: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 5: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 1: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 7: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 3: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 3: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 3: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 22: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 12: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 12: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 30: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 24: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 28: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 28: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 28: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 20: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 6: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 19: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 21: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 15: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 54: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 16: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 2: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 4: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 18: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 10: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 10: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 8: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 8: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 26: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 11: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 17: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 9: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 13: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 27: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 29: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 1: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 1: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 3: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 22: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 12: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 12: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 30: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 24: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 28: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 6: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 21: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 16: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 2: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 18: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 10: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 8: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 26: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 11: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 13: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 29: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 1: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 22: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 30: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 30: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 24: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 20: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 21: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 21: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 15: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 16: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 18: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 26: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 11: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 13: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 29: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 1: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 22: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 30: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 24: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 18: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 29: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 22: [2022-11-26 07:19:45,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step21000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:19:45,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 34: [2022-11-26 07:19:45,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 34: [2022-11-26 07:19:45,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 3: [2022-11-26 07:19:45,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 34: [2022-11-26 07:19:45,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 3: [2022-11-26 07:19:45,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-26 07:19:45,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 2: [2022-11-26 07:19:45,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-26 07:19:45,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-26 07:19:45,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 6: [2022-11-26 07:19:45,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 6: [2022-11-26 07:19:45,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 6: [2022-11-26 07:19:45,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 7: [2022-11-26 07:19:45,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-26 07:19:45,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-26 07:19:45,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 35: [2022-11-26 07:19:45,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:19:45,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 19: [2022-11-26 07:19:45,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 35: [2022-11-26 07:19:45,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 19: [2022-11-26 07:19:45,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-26 07:19:45,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 10: [2022-11-26 07:19:45,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 10: [2022-11-26 07:19:45,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 10: [2022-11-26 07:19:45,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 57: [2022-11-26 07:19:45,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 53: [2022-11-26 07:19:45,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 57: [2022-11-26 07:19:45,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 56: [2022-11-26 07:19:45,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 43: [2022-11-26 07:19:45,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 53: [2022-11-26 07:19:45,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 57: [2022-11-26 07:19:45,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 53: [2022-11-26 07:19:45,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 56: [2022-11-26 07:19:45,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 43: [2022-11-26 07:19:45,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 56: [2022-11-26 07:19:45,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 43: [2022-11-26 07:19:45,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 46: [2022-11-26 07:19:45,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-26 07:19:45,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 26: [2022-11-26 07:19:45,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 46: [2022-11-26 07:19:45,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 26: [2022-11-26 07:19:45,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-26 07:19:45,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 39: [2022-11-26 07:19:45,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 41: [2022-11-26 07:19:45,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-26 07:19:45,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 39: [2022-11-26 07:19:45,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 0: [2022-11-26 07:19:45,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 41: [2022-11-26 07:19:45,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 49: [2022-11-26 07:19:45,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 39: [2022-11-26 07:19:45,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 0: [2022-11-26 07:19:45,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 49: [2022-11-26 07:19:45,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 0: [2022-11-26 07:19:45,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 49: [2022-11-26 07:19:45,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 36: [2022-11-26 07:19:45,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:19:45,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-26 07:19:45,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 3: [2022-11-26 07:19:45,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-26 07:19:45,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-26 07:19:45,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 1: [2022-11-26 07:19:45,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:19:45,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-26 07:19:45,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 18: [2022-11-26 07:19:45,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:19:45,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:19:45,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 18: [2022-11-26 07:19:45,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 29: [2022-11-26 07:19:45,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 18: [2022-11-26 07:19:45,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 40: [2022-11-26 07:19:45,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 15: [2022-11-26 07:19:45,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 40: [2022-11-26 07:19:45,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 15: [2022-11-26 07:19:45,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 40: [2022-11-26 07:19:45,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 15: [2022-11-26 07:19:45,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 47: [2022-11-26 07:19:45,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-26 07:19:45,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 47: [2022-11-26 07:19:45,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 8: [2022-11-26 07:19:45,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-26 07:19:45,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-26 07:19:45,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 59: [2022-11-26 07:19:45,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 24: [2022-11-26 07:19:45,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 59: [2022-11-26 07:19:45,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-26 07:19:45,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 24: [2022-11-26 07:19:45,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-26 07:19:45,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 45: [2022-11-26 07:19:45,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 45: [2022-11-26 07:19:45,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 45: [2022-11-26 07:19:45,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 11: [2022-11-26 07:19:45,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-26 07:19:45,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-26 07:19:45,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 51: [2022-11-26 07:19:45,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-26 07:19:45,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-26 07:19:45,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 51: [2022-11-26 07:19:45,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-26 07:19:45,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-26 07:19:45,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 52: [2022-11-26 07:19:45,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:19:45,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:19:45,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 52: [2022-11-26 07:19:45,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 9: [2022-11-26 07:19:45,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 35: [2022-11-26 07:19:45,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 52: [2022-11-26 07:19:45,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 9: [2022-11-26 07:19:45,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-26 07:19:45,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 9: [2022-11-26 07:19:45,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 21: [2022-11-26 07:19:45,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 21: [2022-11-26 07:19:45,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-26 07:19:45,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 38: [2022-11-26 07:19:45,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-26 07:19:45,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-26 07:19:45,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 33: [2022-11-26 07:19:45,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 62: [2022-11-26 07:19:45,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-26 07:19:45,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 33: [2022-11-26 07:19:45,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 62: [2022-11-26 07:19:45,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 33: [2022-11-26 07:19:45,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 5: [2022-11-26 07:19:45,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 5: [2022-11-26 07:19:45,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-26 07:19:45,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 43: [2022-11-26 07:19:45,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-26 07:19:45,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-26 07:19:45,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 5: [2022-11-26 07:19:45,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-26 07:19:45,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-26 07:19:45,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 24: [2022-11-26 07:19:45,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-26 07:19:45,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-26 07:19:45,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 10: [2022-11-26 07:19:45,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 10: [2022-11-26 07:19:45,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-26 07:19:45,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 28: [2022-11-26 07:19:45,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 41: [2022-11-26 07:19:45,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 28: [2022-11-26 07:19:45,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-26 07:19:45,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 41: [2022-11-26 07:19:45,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-26 07:19:45,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 62: [2022-11-26 07:19:45,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-26 07:19:45,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-26 07:19:45,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 26: [2022-11-26 07:19:45,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 26: [2022-11-26 07:19:45,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-26 07:19:45,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 28: [2022-11-26 07:19:45,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-26 07:19:45,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-26 07:19:45,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 6: [2022-11-26 07:19:45,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 6: [2022-11-26 07:19:45,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 6: [2022-11-26 07:19:45,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 14: [2022-11-26 07:19:45,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-26 07:19:45,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 14: [2022-11-26 07:19:45,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-26 07:19:45,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 42: [2022-11-26 07:19:45,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 14: [2022-11-26 07:19:45,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 14: [2022-11-26 07:19:45,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 42: [2022-11-26 07:19:45,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-26 07:19:45,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 57: [2022-11-26 07:19:45,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 42: [2022-11-26 07:19:45,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 57: [2022-11-26 07:19:45,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 42: [2022-11-26 07:19:45,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 57: [2022-11-26 07:19:45,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 56: [2022-11-26 07:19:45,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 42: [2022-11-26 07:19:45,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 56: [2022-11-26 07:19:45,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-26 07:19:45,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 22: [2022-11-26 07:19:45,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 22: [2022-11-26 07:19:45,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 34: [2022-11-26 07:19:45,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 22: [2022-11-26 07:19:45,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 34: [2022-11-26 07:19:45,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 0: [2022-11-26 07:19:45,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 34: [2022-11-26 07:19:45,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 0: [2022-11-26 07:19:45,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-26 07:19:45,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 52: [2022-11-26 07:19:45,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-26 07:19:45,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 34: [2022-11-26 07:19:45,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 52: [2022-11-26 07:19:45,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 34: [2022-11-26 07:19:45,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 34: [2022-11-26 07:19:45,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 36: [2022-11-26 07:19:45,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 59: [2022-11-26 07:19:45,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-26 07:19:45,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 7: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 46: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 7: [2022-11-26 07:19:45,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 46: [2022-11-26 07:19:45,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 59: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 7: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 46: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 59: [2022-11-26 07:19:45,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 52: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 4: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 52: [2022-11-26 07:19:45,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 59: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 52: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 49: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 49: [2022-11-26 07:19:45,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 8: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 8: [2022-11-26 07:19:45,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 47: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 8: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 47: [2022-11-26 07:19:45,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-26 07:19:45,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 42: [2022-11-26 07:19:45,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 42: [2022-11-26 07:19:45,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-26 07:19:45,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 19: [2022-11-26 07:19:45,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 33: [2022-11-26 07:19:45,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 19: [2022-11-26 07:19:45,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-26 07:19:45,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 33: [2022-11-26 07:19:45,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-26 07:19:45,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 4: [2022-11-26 07:19:45,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 4: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:19:45,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 4: [2022-11-26 07:19:45,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 1: [2022-11-26 07:19:45,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 4: [2022-11-26 07:19:45,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 1: [2022-11-26 07:19:45,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 1: [2022-11-26 07:19:45,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:19:45,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-26 07:19:45,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 19: [2022-11-26 07:19:45,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 19: [2022-11-26 07:19:45,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-26 07:19:45,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 12: [2022-11-26 07:19:45,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 44: [2022-11-26 07:19:45,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-26 07:19:45,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 44: [2022-11-26 07:19:45,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-26 07:19:45,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 12: [2022-11-26 07:19:45,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 44: [2022-11-26 07:19:45,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 44: [2022-11-26 07:19:45,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 12: [2022-11-26 07:19:45,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 15: [2022-11-26 07:19:45,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-26 07:19:45,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-26 07:19:45,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 9: [2022-11-26 07:19:45,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-26 07:19:45,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 39: [2022-11-26 07:19:45,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 13: [2022-11-26 07:19:45,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-26 07:19:45,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-26 07:19:45,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-26 07:19:45,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-26 07:19:45,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 13: [2022-11-26 07:19:45,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 53: [2022-11-26 07:19:45,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-26 07:19:45,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-26 07:19:45,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 18: [2022-11-26 07:19:45,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 39: [2022-11-26 07:19:45,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 18: [2022-11-26 07:19:45,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 9: [2022-11-26 07:19:45,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 39: [2022-11-26 07:19:45,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 18: [2022-11-26 07:19:45,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 9: [2022-11-26 07:19:45,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 24: [2022-11-26 07:19:45,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 9: [2022-11-26 07:19:45,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 24: [2022-11-26 07:19:45,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-26 07:19:45,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 62: [2022-11-26 07:19:45,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 49: [2022-11-26 07:19:45,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 62: [2022-11-26 07:19:45,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 49: [2022-11-26 07:19:45,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 62: [2022-11-26 07:19:45,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 2: [2022-11-26 07:19:45,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 49: [2022-11-26 07:19:45,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 2: [2022-11-26 07:19:45,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-26 07:19:45,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 31: [2022-11-26 07:19:45,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-26 07:19:45,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 5: [2022-11-26 07:19:45,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-26 07:19:45,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 31: [2022-11-26 07:19:45,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-26 07:19:45,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 5: [2022-11-26 07:19:45,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 31: [2022-11-26 07:19:45,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 31: [2022-11-26 07:19:45,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 53: [2022-11-26 07:19:45,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-26 07:19:45,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-26 07:19:45,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 57: [2022-11-26 07:19:45,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-26 07:19:45,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-26 07:19:45,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 29: [2022-11-26 07:19:45,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:19:45,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 3: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 52: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 3: [2022-11-26 07:19:45,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 52: [2022-11-26 07:19:45,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 22: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 52: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 22: [2022-11-26 07:19:45,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 11: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 3: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 22: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 11: [2022-11-26 07:19:45,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 11: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 23: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 23: [2022-11-26 07:19:45,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-26 07:19:45,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 23: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 29: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:19:45,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 21: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 21: [2022-11-26 07:19:45,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 58: [2022-11-26 07:19:45,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 21: [2022-11-26 07:19:45,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 23: [2022-11-26 07:19:45,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-26 07:19:45,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-26 07:19:45,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 41: [2022-11-26 07:19:45,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 41: [2022-11-26 07:19:45,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 41: [2022-11-26 07:19:45,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 44: [2022-11-26 07:19:45,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 38: [2022-11-26 07:19:45,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 44: [2022-11-26 07:19:45,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-26 07:19:45,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 48: [2022-11-26 07:19:45,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-26 07:19:45,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:19:45,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 31: [2022-11-26 07:19:45,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 48: [2022-11-26 07:19:45,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:19:45,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 11: [2022-11-26 07:19:45,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 48: [2022-11-26 07:19:45,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 55: [2022-11-26 07:19:45,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 11: [2022-11-26 07:19:45,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 31: [2022-11-26 07:19:45,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 48: [2022-11-26 07:19:45,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-26 07:19:45,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 55: [2022-11-26 07:19:45,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 11: [2022-11-26 07:19:45,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 31: [2022-11-26 07:19:45,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 48: [2022-11-26 07:19:45,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 55: [2022-11-26 07:19:45,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 48: [2022-11-26 07:19:45,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 55: [2022-11-26 07:19:45,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 48: [2022-11-26 07:19:45,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 46: [2022-11-26 07:19:45,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-26 07:19:45,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-26 07:19:45,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 22: [2022-11-26 07:19:45,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-26 07:19:45,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-26 07:19:45,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 12: [2022-11-26 07:19:45,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:19:45,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-26 07:19:45,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 40: [2022-11-26 07:19:45,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-26 07:19:45,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 7: [2022-11-26 07:19:45,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 40: [2022-11-26 07:19:45,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 7: [2022-11-26 07:19:45,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-26 07:19:45,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 14: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 47: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-26 07:19:45,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 3: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 47: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 63: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 14: [2022-11-26 07:19:45,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 63: [2022-11-26 07:19:45,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-26 07:19:45,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 21: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 3: [2022-11-26 07:19:45,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 63: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 21: [2022-11-26 07:19:45,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 63: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 21: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 8: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 3: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 10: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-26 07:19:45,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 8: [2022-11-26 07:19:45,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 10: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 8: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 36: [2022-11-26 07:19:45,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-26 07:19:45,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 36: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:19:45,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 43: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 56: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 43: [2022-11-26 07:19:45,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-26 07:19:45,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 56: [2022-11-26 07:19:45,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 56: [2022-11-26 07:19:45,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 39: [2022-11-26 07:19:45,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-26 07:19:45,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-26 07:19:45,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 50: [2022-11-26 07:19:45,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-26 07:19:45,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-26 07:19:45,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-26 07:19:45,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-26 07:19:45,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 50: [2022-11-26 07:19:45,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 50: [2022-11-26 07:19:45,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 50: [2022-11-26 07:19:45,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-26 07:19:45,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 6: [2022-11-26 07:19:45,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-26 07:19:45,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-26 07:19:45,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 40: [2022-11-26 07:19:45,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 28: [2022-11-26 07:19:45,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 40: [2022-11-26 07:19:45,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 28: [2022-11-26 07:19:45,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-26 07:19:45,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 40: [2022-11-26 07:19:45,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 0: [2022-11-26 07:19:45,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 0: [2022-11-26 07:19:45,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-26 07:19:45,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 4: [2022-11-26 07:19:45,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 32: [2022-11-26 07:19:45,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-26 07:19:45,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-26 07:19:45,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-26 07:19:45,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 32: [2022-11-26 07:19:45,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-26 07:19:45,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 15: [2022-11-26 07:19:45,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-26 07:19:45,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 38: [2022-11-26 07:19:45,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 15: [2022-11-26 07:19:45,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 38: [2022-11-26 07:19:45,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 58: [2022-11-26 07:19:45,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-26 07:19:45,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 58: [2022-11-26 07:19:45,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-26 07:19:45,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-26 07:19:45,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 54: [2022-11-26 07:19:45,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 54: [2022-11-26 07:19:45,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 54: [2022-11-26 07:19:45,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 58: [2022-11-26 07:19:45,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 58: [2022-11-26 07:19:45,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-26 07:19:45,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 4: [2022-11-26 07:19:45,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 54: [2022-11-26 07:19:45,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-26 07:19:45,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-26 07:19:45,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 4: [2022-11-26 07:19:45,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 54: [2022-11-26 07:19:45,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 54: [2022-11-26 07:19:45,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 54: [2022-11-26 07:19:45,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 63: [2022-11-26 07:19:45,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 25: [2022-11-26 07:19:45,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 25: [2022-11-26 07:19:45,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-26 07:19:45,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 63: [2022-11-26 07:19:45,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 25: [2022-11-26 07:19:45,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 63: [2022-11-26 07:19:45,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 25: [2022-11-26 07:19:45,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-26 07:19:45,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-26 07:19:45,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 25: [2022-11-26 07:19:45,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 25: [2022-11-26 07:19:45,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 13: [2022-11-26 07:19:45,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-26 07:19:45,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-26 07:19:45,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 27: [2022-11-26 07:19:45,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:19:45,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-26 07:19:45,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 27: [2022-11-26 07:19:45,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:19:45,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:19:45,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-26 07:19:45,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-26 07:19:45,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 27: [2022-11-26 07:19:45,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 38: [2022-11-26 07:19:45,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-26 07:19:45,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-26 07:19:45,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 14: [2022-11-26 07:19:45,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 14: [2022-11-26 07:19:45,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 16: [2022-11-26 07:19:45,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-26 07:19:45,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-26 07:19:45,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-26 07:19:45,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 14: [2022-11-26 07:19:45,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 16: [2022-11-26 07:19:45,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 16: [2022-11-26 07:19:45,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 2: [2022-11-26 07:19:45,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 2: [2022-11-26 07:19:45,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-26 07:19:45,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 16: [2022-11-26 07:19:45,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-26 07:19:45,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-26 07:19:45,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 28: [2022-11-26 07:19:45,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 28: [2022-11-26 07:19:45,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-26 07:19:45,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 43: [2022-11-26 07:19:45,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-26 07:19:45,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-26 07:19:45,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 55: [2022-11-26 07:19:45,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 62: [2022-11-26 07:19:45,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-26 07:19:45,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-26 07:19:45,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 26: [2022-11-26 07:19:45,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 26: [2022-11-26 07:19:45,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 26: [2022-11-26 07:19:45,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 26: [2022-11-26 07:19:45,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 55: [2022-11-26 07:19:45,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 26: [2022-11-26 07:19:45,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 55: [2022-11-26 07:19:45,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 58: [2022-11-26 07:19:45,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 26: [2022-11-26 07:19:45,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 18: [2022-11-26 07:19:45,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-26 07:19:45,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-26 07:19:45,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 13: [2022-11-26 07:19:45,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-26 07:19:45,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-26 07:19:45,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 17: [2022-11-26 07:19:45,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-26 07:19:45,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-26 07:19:45,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-26 07:19:45,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-26 07:19:45,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 17: [2022-11-26 07:19:45,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 17: [2022-11-26 07:19:45,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 17: [2022-11-26 07:19:45,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 17: [2022-11-26 07:19:45,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 58: [2022-11-26 07:19:45,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-26 07:19:45,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 32: [2022-11-26 07:19:45,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 32: [2022-11-26 07:19:45,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-26 07:19:45,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 5: [2022-11-26 07:19:45,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-26 07:19:45,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-26 07:19:45,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 10: [2022-11-26 07:19:45,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-26 07:19:45,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-26 07:19:45,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 51: [2022-11-26 07:19:45,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 51: [2022-11-26 07:19:45,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-26 07:19:45,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 19: [2022-11-26 07:19:45,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-26 07:19:45,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-26 07:19:45,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 27: [2022-11-26 07:19:45,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 34: [2022-11-26 07:19:45,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 6: [2022-11-26 07:19:45,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 59: [2022-11-26 07:19:45,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-26 07:19:45,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 34: [2022-11-26 07:19:45,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 59: [2022-11-26 07:19:45,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 34: [2022-11-26 07:19:45,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 6: [2022-11-26 07:19:45,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-26 07:19:45,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 20: [2022-11-26 07:19:45,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 20: [2022-11-26 07:19:45,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-26 07:19:45,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-26 07:19:45,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-26 07:19:45,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-26 07:19:45,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 20: [2022-11-26 07:19:45,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-26 07:19:45,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 20: [2022-11-26 07:19:45,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 12: [2022-11-26 07:19:45,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:19:45,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-26 07:19:45,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 42: [2022-11-26 07:19:45,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 42: [2022-11-26 07:19:45,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-26 07:19:45,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 16: [2022-11-26 07:19:45,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-26 07:19:45,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 16: [2022-11-26 07:19:45,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 30: [2022-11-26 07:19:45,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-26 07:19:45,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 30: [2022-11-26 07:19:45,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 30: [2022-11-26 07:19:45,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 9: [2022-11-26 07:19:45,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 30: [2022-11-26 07:19:45,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 30: [2022-11-26 07:19:45,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-26 07:19:45,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-26 07:19:45,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-26 07:19:45,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 30: [2022-11-26 07:19:45,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 30: [2022-11-26 07:19:45,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 30: [2022-11-26 07:19:45,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 7: [2022-11-26 07:19:45,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 7: [2022-11-26 07:19:45,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-26 07:19:45,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 29: [2022-11-26 07:19:45,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:19:45,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-26 07:19:45,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 32: [2022-11-26 07:19:45,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-26 07:19:45,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-26 07:19:45,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 47: [2022-11-26 07:19:45,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-26 07:19:45,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-26 07:19:45,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 1: [2022-11-26 07:19:45,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:19:45,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-26 07:19:45,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 51: [2022-11-26 07:19:45,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-26 07:19:45,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-26 07:19:45,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 24: [2022-11-26 07:19:45,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-26 07:19:45,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-26 07:19:45,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 23: [2022-11-26 07:19:45,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-26 07:19:45,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-26 07:19:45,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 41: [2022-11-26 07:19:45,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 41: [2022-11-26 07:19:45,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 41: [2022-11-26 07:19:45,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 50: [2022-11-26 07:19:45,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-26 07:19:45,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-26 07:19:45,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 27: [2022-11-26 07:19:45,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-26 07:19:45,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 37: [2022-11-26 07:19:45,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 37: [2022-11-26 07:19:45,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 37: [2022-11-26 07:19:45,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-26 07:19:45,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-26 07:19:45,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-26 07:19:45,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-26 07:19:45,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 37: [2022-11-26 07:19:45,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 37: [2022-11-26 07:19:45,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 2: [2022-11-26 07:19:45,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-26 07:19:45,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-26 07:19:45,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 61: [2022-11-26 07:19:45,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 61: [2022-11-26 07:19:45,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-26 07:19:45,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-26 07:19:45,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 61: [2022-11-26 07:19:45,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-26 07:19:45,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-26 07:19:45,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-26 07:19:45,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-26 07:19:45,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 61: [2022-11-26 07:19:45,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 61: [2022-11-26 07:19:45,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 61: [2022-11-26 07:19:45,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 20: [2022-11-26 07:19:45,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-26 07:19:45,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-26 07:19:45,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 33: [2022-11-26 07:19:45,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-26 07:19:45,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-26 07:19:45,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 9: [2022-11-26 07:19:45,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-26 07:19:45,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 21: [2022-11-26 07:19:45,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-26 07:19:45,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-26 07:19:45,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 35: [2022-11-26 07:19:45,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:19:45,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-26 07:19:45,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 44: [2022-11-26 07:19:45,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-26 07:19:45,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-26 07:19:45,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 49: [2022-11-26 07:19:45,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-26 07:19:45,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-26 07:19:45,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 53: [2022-11-26 07:19:45,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-26 07:19:45,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-26 07:19:45,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 40: [2022-11-26 07:19:45,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-26 07:19:45,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-26 07:19:45,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 60: [2022-11-26 07:19:45,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 60: [2022-11-26 07:19:45,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-26 07:19:45,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 60: [2022-11-26 07:19:45,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-26 07:19:45,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-26 07:19:45,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-26 07:19:45,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 60: [2022-11-26 07:19:45,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-26 07:19:45,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 60: [2022-11-26 07:19:45,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 60: [2022-11-26 07:19:45,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 60: [2022-11-26 07:19:45,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 25: [2022-11-26 07:19:45,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 25: [2022-11-26 07:19:45,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-26 07:19:45,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 37: [2022-11-26 07:19:45,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 37: [2022-11-26 07:19:45,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-26 07:19:45,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 0: [2022-11-26 07:19:45,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-26 07:19:45,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 39: [2022-11-26 07:19:45,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-26 07:19:45,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-26 07:19:45,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 45: [2022-11-26 07:19:45,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-26 07:19:45,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-26 07:19:45,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 11: [2022-11-26 07:19:45,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 11: [2022-11-26 07:19:45,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 11: [2022-11-26 07:19:45,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 35: [2022-11-26 07:19:45,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:19:45,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-26 07:19:45,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 56: [2022-11-26 07:19:45,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-26 07:19:45,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-26 07:19:45,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 45: [2022-11-26 07:19:45,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-26 07:19:45,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-26 07:19:45,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 57: [2022-11-26 07:19:45,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 57: [2022-11-26 07:19:45,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 57: [2022-11-26 07:19:45,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 45: [2022-11-26 07:19:45,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 45: [2022-11-26 07:19:45,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-26 07:19:45,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 54: [2022-11-26 07:19:45,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 54: [2022-11-26 07:19:45,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-26 07:19:45,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 36: [2022-11-26 07:19:45,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:19:45,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-26 07:19:45,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 18: [2022-11-26 07:19:45,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-26 07:19:45,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-26 07:19:45,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 8: [2022-11-26 07:19:45,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-26 07:19:45,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-26 07:19:45,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 46: [2022-11-26 07:19:45,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-26 07:19:45,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-26 07:19:45,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 48: [2022-11-26 07:19:45,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-26 07:19:45,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-26 07:19:45,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 15: [2022-11-26 07:19:45,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 15: [2022-11-26 07:19:45,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-26 07:19:45,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 31: [2022-11-26 07:19:45,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-26 07:19:45,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-26 07:19:45,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 38: [2022-11-26 07:19:45,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-26 07:19:45,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-26 07:19:45,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 63: [2022-11-26 07:19:45,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-26 07:19:45,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-26 07:19:45,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 22: [2022-11-26 07:19:45,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-26 07:19:45,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-26 07:19:45,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 0: [2022-11-26 07:19:45,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 0: [2022-11-26 07:19:45,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 0: [2022-11-26 07:19:45,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 4: [2022-11-26 07:19:45,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:19:45,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 4: [2022-11-26 07:19:45,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-26 07:19:45,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 12: [2022-11-26 07:19:45,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-26 07:19:45,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 60: [2022-11-26 07:19:45,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-26 07:19:45,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-26 07:19:45,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 52: [2022-11-26 07:19:45,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-26 07:19:45,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-26 07:19:45,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 62: [2022-11-26 07:19:45,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-26 07:19:45,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-26 07:19:45,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 17: [2022-11-26 07:19:45,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-26 07:19:45,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-26 07:19:45,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 55: [2022-11-26 07:19:45,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:19:45,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-26 07:19:45,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 6: [2022-11-26 07:19:45,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-26 07:19:45,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-26 07:19:45,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 14: [2022-11-26 07:19:45,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-26 07:19:45,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-26 07:19:45,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 10: [2022-11-26 07:19:45,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-26 07:19:45,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-26 07:19:45,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 28: [2022-11-26 07:19:45,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-26 07:19:45,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-26 07:19:45,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 3: [2022-11-26 07:19:45,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-26 07:19:45,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 3: [2022-11-26 07:19:45,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 47: [2022-11-26 07:19:45,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 47: [2022-11-26 07:19:45,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 47: [2022-11-26 07:19:45,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 30: [2022-11-26 07:19:45,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-26 07:19:45,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-26 07:19:45,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 34: [2022-11-26 07:19:45,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 34: [2022-11-26 07:19:45,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-26 07:19:45,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 23: [2022-11-26 07:19:45,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 23: [2022-11-26 07:19:45,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 23: [2022-11-26 07:19:45,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 7: [2022-11-26 07:19:45,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 7: [2022-11-26 07:19:45,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 7: [2022-11-26 07:19:45,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 9: [2022-11-26 07:19:45,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 9: [2022-11-26 07:19:45,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-26 07:19:45,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 58: [2022-11-26 07:19:45,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-26 07:19:45,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-26 07:19:45,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 5: [2022-11-26 07:19:45,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-26 07:19:45,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 5: [2022-11-26 07:19:45,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 43: [2022-11-26 07:19:45,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-26 07:19:45,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-26 07:19:45,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 21: [2022-11-26 07:19:45,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-26 07:19:45,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-26 07:19:45,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 29: [2022-11-26 07:19:45,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:19:45,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-26 07:19:45,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 19: [2022-11-26 07:19:45,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-26 07:19:45,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-26 07:19:45,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 2: [2022-11-26 07:19:45,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-26 07:19:45,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-26 07:19:45,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 42: [2022-11-26 07:19:45,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 42: [2022-11-26 07:19:45,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-26 07:19:45,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 51: [2022-11-26 07:19:45,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-26 07:19:45,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-26 07:19:45,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 27: [2022-11-26 07:19:45,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 32: [2022-11-26 07:19:45,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 27: [2022-11-26 07:19:45,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 32: [2022-11-26 07:19:45,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 27: [2022-11-26 07:19:45,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 32: [2022-11-26 07:19:45,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 26: [2022-11-26 07:19:45,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-26 07:19:45,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-26 07:19:45,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 24: [2022-11-26 07:19:45,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-26 07:19:45,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-26 07:19:45,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 59: [2022-11-26 07:19:45,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-26 07:19:45,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-26 07:19:45,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 13: [2022-11-26 07:19:45,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 40: [2022-11-26 07:19:45,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-26 07:19:45,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 13: [2022-11-26 07:19:45,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-26 07:19:45,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 40: [2022-11-26 07:19:45,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 53: [2022-11-26 07:19:45,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-26 07:19:45,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-26 07:19:45,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 1: [2022-11-26 07:19:45,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:19:45,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-26 07:19:45,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 33: [2022-11-26 07:19:45,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-26 07:19:45,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-26 07:19:45,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 35: [2022-11-26 07:19:45,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:19:45,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-26 07:19:45,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 49: [2022-11-26 07:19:45,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-26 07:19:45,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 49: [2022-11-26 07:19:45,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 45: [2022-11-26 07:19:45,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-26 07:19:45,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-26 07:19:45,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 61: [2022-11-26 07:19:45,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-26 07:19:45,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-26 07:19:45,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 16: [2022-11-26 07:19:45,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-26 07:19:45,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-26 07:19:45,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 41: [2022-11-26 07:19:45,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 41: [2022-11-26 07:19:45,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-26 07:19:45,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 25: [2022-11-26 07:19:45,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-26 07:19:45,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-26 07:19:45,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 11: [2022-11-26 07:19:45,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-26 07:19:45,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-26 07:19:45,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 20: [2022-11-26 07:19:45,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-26 07:19:45,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-26 07:19:45,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 18: [2022-11-26 07:19:45,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-26 07:19:45,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-26 07:19:45,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 44: [2022-11-26 07:19:45,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-26 07:19:45,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-26 07:19:45,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 50: [2022-11-26 07:19:45,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 50: [2022-11-26 07:19:45,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-26 07:19:45,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 52: [2022-11-26 07:19:45,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-26 07:19:45,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 37: [2022-11-26 07:19:45,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 52: [2022-11-26 07:19:45,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 37: [2022-11-26 07:19:45,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-26 07:19:45,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 57: [2022-11-26 07:19:45,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 15: [2022-11-26 07:19:45,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-26 07:19:45,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-26 07:19:45,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 57: [2022-11-26 07:19:45,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 57: [2022-11-26 07:19:45,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 39: [2022-11-26 07:19:45,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-26 07:19:45,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-26 07:19:45,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 46: [2022-11-26 07:19:45,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-26 07:19:45,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-26 07:19:45,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 56: [2022-11-26 07:19:45,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-26 07:19:45,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-26 07:19:45,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 54: [2022-11-26 07:19:45,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 48: [2022-11-26 07:19:45,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 54: [2022-11-26 07:19:45,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-26 07:19:45,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 8: [2022-11-26 07:19:45,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 8: [2022-11-26 07:19:45,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-26 07:19:45,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 22: [2022-11-26 07:19:45,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-26 07:19:45,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-26 07:19:45,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 0: [2022-11-26 07:19:45,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-26 07:19:45,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-26 07:19:45,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 48: [2022-11-26 07:19:45,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-26 07:19:45,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 36: [2022-11-26 07:19:45,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:19:45,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 60: [2022-11-26 07:19:45,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:19:45,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 60: [2022-11-26 07:19:45,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-26 07:19:45,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 63: [2022-11-26 07:19:45,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-26 07:19:45,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-26 07:19:45,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 17: [2022-11-26 07:19:45,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-26 07:19:45,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-26 07:19:45,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 38: [2022-11-26 07:19:45,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 38: [2022-11-26 07:19:45,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-26 07:19:45,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 9: [2022-11-26 07:19:45,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 5: [2022-11-26 07:19:45,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 9: [2022-11-26 07:19:45,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 5: [2022-11-26 07:19:45,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 9: [2022-11-26 07:19:45,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 5: [2022-11-26 07:19:45,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 12: [2022-11-26 07:19:45,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:19:45,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-26 07:19:45,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 30: [2022-11-26 07:19:45,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-26 07:19:45,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-26 07:19:45,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 4: [2022-11-26 07:19:45,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-26 07:19:45,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-26 07:19:45,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 3: [2022-11-26 07:19:45,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-26 07:19:45,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-26 07:19:45,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 23: [2022-11-26 07:19:45,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 6: [2022-11-26 07:19:45,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 23: [2022-11-26 07:19:45,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 6: [2022-11-26 07:19:45,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 23: [2022-11-26 07:19:45,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 6: [2022-11-26 07:19:45,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 10: [2022-11-26 07:19:45,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-26 07:19:45,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-26 07:19:45,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 2: [2022-11-26 07:19:45,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 42: [2022-11-26 07:19:45,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 2: [2022-11-26 07:19:45,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-26 07:19:45,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 42: [2022-11-26 07:19:45,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 34: [2022-11-26 07:19:45,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 42: [2022-11-26 07:19:45,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 34: [2022-11-26 07:19:45,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-26 07:19:45,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 59: [2022-11-26 07:19:45,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-26 07:19:45,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-26 07:19:45,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 61: [2022-11-26 07:19:45,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-26 07:19:45,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-26 07:19:45,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 27: [2022-11-26 07:19:45,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:19:45,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 62: [2022-11-26 07:19:45,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 27: [2022-11-26 07:19:45,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 62: [2022-11-26 07:19:45,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-26 07:19:45,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 47: [2022-11-26 07:19:45,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-26 07:19:45,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-26 07:19:45,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 55: [2022-11-26 07:19:45,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:19:45,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-26 07:19:45,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 14: [2022-11-26 07:19:45,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-26 07:19:45,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-26 07:19:45,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 32: [2022-11-26 07:19:45,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-26 07:19:45,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-26 07:19:45,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 21: [2022-11-26 07:19:45,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 43: [2022-11-26 07:19:45,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 43: [2022-11-26 07:19:45,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 21: [2022-11-26 07:19:45,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 43: [2022-11-26 07:19:45,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 21: [2022-11-26 07:19:45,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 29: [2022-11-26 07:19:45,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:19:45,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-26 07:19:45,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 41: [2022-11-26 07:19:45,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-26 07:19:45,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-26 07:19:45,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 51: [2022-11-26 07:19:45,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-26 07:19:45,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-26 07:19:45,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 24: [2022-11-26 07:19:45,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-26 07:19:45,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-26 07:19:45,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 58: [2022-11-26 07:19:45,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-26 07:19:45,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 13: [2022-11-26 07:19:45,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 58: [2022-11-26 07:19:45,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 13: [2022-11-26 07:19:45,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-26 07:19:45,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 35: [2022-11-26 07:19:45,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:19:45,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-26 07:19:45,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 26: [2022-11-26 07:19:45,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-26 07:19:45,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-26 07:19:45,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 19: [2022-11-26 07:19:45,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 19: [2022-11-26 07:19:45,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-26 07:19:45,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 1: [2022-11-26 07:19:45,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:19:45,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-26 07:19:45,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 28: [2022-11-26 07:19:45,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 28: [2022-11-26 07:19:45,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-26 07:19:45,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 16: [2022-11-26 07:19:45,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-26 07:19:45,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-26 07:19:45,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 40: [2022-11-26 07:19:45,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 11: [2022-11-26 07:19:45,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 40: [2022-11-26 07:19:45,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-26 07:19:45,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 50: [2022-11-26 07:19:45,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 11: [2022-11-26 07:19:45,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 50: [2022-11-26 07:19:45,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 11: [2022-11-26 07:19:45,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 50: [2022-11-26 07:19:45,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 20: [2022-11-26 07:19:45,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-26 07:19:45,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-26 07:19:45,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 31: [2022-11-26 07:19:45,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 31: [2022-11-26 07:19:45,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-26 07:19:45,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 33: [2022-11-26 07:19:45,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 33: [2022-11-26 07:19:45,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 33: [2022-11-26 07:19:45,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 7: [2022-11-26 07:19:45,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-26 07:19:45,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-26 07:19:45,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 39: [2022-11-26 07:19:45,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-26 07:19:45,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-26 07:19:45,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 44: [2022-11-26 07:19:45,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-26 07:19:45,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-26 07:19:45,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 37: [2022-11-26 07:19:45,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-26 07:19:45,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-26 07:19:45,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 53: [2022-11-26 07:19:45,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-26 07:19:45,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-26 07:19:45,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 57: [2022-11-26 07:19:45,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 45: [2022-11-26 07:19:45,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 45: [2022-11-26 07:19:45,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 57: [2022-11-26 07:19:45,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 45: [2022-11-26 07:19:45,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 57: [2022-11-26 07:19:45,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 49: [2022-11-26 07:19:45,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 56: [2022-11-26 07:19:45,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 49: [2022-11-26 07:19:45,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-26 07:19:45,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 56: [2022-11-26 07:19:45,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-26 07:19:45,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 48: [2022-11-26 07:19:45,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-26 07:19:45,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-26 07:19:45,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 46: [2022-11-26 07:19:45,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 54: [2022-11-26 07:19:45,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 46: [2022-11-26 07:19:45,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-26 07:19:45,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 54: [2022-11-26 07:19:45,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-26 07:19:45,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 8: [2022-11-26 07:19:45,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-26 07:19:45,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-26 07:19:45,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 52: [2022-11-26 07:19:45,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-26 07:19:45,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-26 07:19:45,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 18: [2022-11-26 07:19:45,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-26 07:19:45,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-26 07:19:45,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 15: [2022-11-26 07:19:45,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 25: [2022-11-26 07:19:45,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 15: [2022-11-26 07:19:45,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 25: [2022-11-26 07:19:45,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 15: [2022-11-26 07:19:45,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 25: [2022-11-26 07:19:45,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 63: [2022-11-26 07:19:45,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-26 07:19:45,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-26 07:19:45,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 22: [2022-11-26 07:19:45,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-26 07:19:45,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-26 07:19:45,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 31: [2022-11-26 07:19:45,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 36: [2022-11-26 07:19:45,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:19:45,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 36: [2022-11-26 07:19:45,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 31: [2022-11-26 07:19:45,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-26 07:19:45,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 4: [2022-11-26 07:19:45,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 4: [2022-11-26 07:19:45,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 4: [2022-11-26 07:19:45,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 60: [2022-11-26 07:19:45,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-26 07:19:45,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-26 07:19:45,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 62: [2022-11-26 07:19:45,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-26 07:19:45,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-26 07:19:45,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 17: [2022-11-26 07:19:45,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-26 07:19:45,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-26 07:19:45,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 14: [2022-11-26 07:19:45,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-26 07:19:45,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-26 07:19:45,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 12: [2022-11-26 07:19:45,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:19:45,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-26 07:19:45,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 3: [2022-11-26 07:19:45,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 3: [2022-11-26 07:19:45,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 3: [2022-11-26 07:19:45,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 55: [2022-11-26 07:19:45,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:19:45,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-26 07:19:45,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 10: [2022-11-26 07:19:45,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-26 07:19:45,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-26 07:19:45,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 34: [2022-11-26 07:19:45,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 34: [2022-11-26 07:19:45,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-26 07:19:45,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 5: [2022-11-26 07:19:45,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 5: [2022-11-26 07:19:45,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-26 07:19:45,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 30: [2022-11-26 07:19:45,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-26 07:19:45,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-26 07:19:45,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 6: [2022-11-26 07:19:45,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 6: [2022-11-26 07:19:45,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-26 07:19:45,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 28: [2022-11-26 07:19:45,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-26 07:19:45,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-26 07:19:45,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 7: [2022-11-26 07:19:45,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-26 07:19:45,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-26 07:19:45,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 19: [2022-11-26 07:19:45,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-26 07:19:45,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-26 07:19:45,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 2: [2022-11-26 07:19:45,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 2: [2022-11-26 07:19:45,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-26 07:19:45,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 23: [2022-11-26 07:19:45,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 47: [2022-11-26 07:19:45,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 21: [2022-11-26 07:19:45,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 23: [2022-11-26 07:19:45,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 47: [2022-11-26 07:19:45,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 23: [2022-11-26 07:19:45,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 47: [2022-11-26 07:19:45,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 21: [2022-11-26 07:19:45,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 21: [2022-11-26 07:19:45,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 43: [2022-11-26 07:19:45,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-26 07:19:45,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-26 07:19:45,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 9: [2022-11-26 07:19:45,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-26 07:19:45,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 58: [2022-11-26 07:19:45,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 9: [2022-11-26 07:19:45,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 58: [2022-11-26 07:19:45,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-26 07:19:45,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 29: [2022-11-26 07:19:45,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:19:45,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 59: [2022-11-26 07:19:45,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 29: [2022-11-26 07:19:45,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 59: [2022-11-26 07:19:45,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-26 07:19:45,547] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 13: [2022-11-26 07:19:45,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:19:45,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:19:45,548] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 13: [2022-11-26 07:19:45,548] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 27: [2022-11-26 07:19:45,548] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 13: [2022-11-26 07:19:45,548] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 26: [2022-11-26 07:19:45,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-26 07:19:45,549] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-26 07:19:45,549] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 42: [2022-11-26 07:19:45,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 61: [2022-11-26 07:19:45,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 42: [2022-11-26 07:19:45,549] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 61: [2022-11-26 07:19:45,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 42: [2022-11-26 07:19:45,549] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 61: [2022-11-26 07:19:45,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 24: [2022-11-26 07:19:45,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-26 07:19:45,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-26 07:19:45,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 1: [2022-11-26 07:19:45,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:19:45,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 1: [2022-11-26 07:19:45,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 38: [2022-11-26 07:19:45,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 38: [2022-11-26 07:19:45,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-26 07:19:45,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 41: [2022-11-26 07:19:45,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-26 07:19:45,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 41: [2022-11-26 07:19:45,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 49: [2022-11-26 07:19:45,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-26 07:19:45,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-26 07:19:45,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 32: [2022-11-26 07:19:45,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-26 07:19:45,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-26 07:19:45,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 40: [2022-11-26 07:19:45,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-26 07:19:45,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-26 07:19:45,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 37: [2022-11-26 07:19:45,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 20: [2022-11-26 07:19:45,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 11: [2022-11-26 07:19:45,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 37: [2022-11-26 07:19:45,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-26 07:19:45,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 20: [2022-11-26 07:19:45,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 11: [2022-11-26 07:19:45,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-26 07:19:45,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 20: [2022-11-26 07:19:45,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 35: [2022-11-26 07:19:45,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:19:45,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-26 07:19:45,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 25: [2022-11-26 07:19:45,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 25: [2022-11-26 07:19:45,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-26 07:19:45,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 57: [2022-11-26 07:19:45,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 44: [2022-11-26 07:19:45,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 57: [2022-11-26 07:19:45,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-26 07:19:45,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 44: [2022-11-26 07:19:45,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-26 07:19:45,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 16: [2022-11-26 07:19:45,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 16: [2022-11-26 07:19:45,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-26 07:19:45,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 51: [2022-11-26 07:19:45,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-26 07:19:45,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 45: [2022-11-26 07:19:45,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 51: [2022-11-26 07:19:45,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 45: [2022-11-26 07:19:45,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-26 07:19:45,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 53: [2022-11-26 07:19:45,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 50: [2022-11-26 07:19:45,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 53: [2022-11-26 07:19:45,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 50: [2022-11-26 07:19:45,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 53: [2022-11-26 07:19:45,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 50: [2022-11-26 07:19:45,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 33: [2022-11-26 07:19:45,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-26 07:19:45,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-26 07:19:45,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 52: [2022-11-26 07:19:45,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 52: [2022-11-26 07:19:45,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-26 07:19:45,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 48: [2022-11-26 07:19:45,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-26 07:19:45,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-26 07:19:45,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 0: [2022-11-26 07:19:45,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-26 07:19:45,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-26 07:19:45,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 54: [2022-11-26 07:19:45,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 54: [2022-11-26 07:19:45,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 62: [2022-11-26 07:19:45,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 54: [2022-11-26 07:19:45,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 62: [2022-11-26 07:19:45,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-26 07:19:45,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 56: [2022-11-26 07:19:45,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-26 07:19:45,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 56: [2022-11-26 07:19:45,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 39: [2022-11-26 07:19:45,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-26 07:19:45,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-26 07:19:45,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 18: [2022-11-26 07:19:45,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 18: [2022-11-26 07:19:45,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-26 07:19:45,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 46: [2022-11-26 07:19:45,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-26 07:19:45,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-26 07:19:45,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 15: [2022-11-26 07:19:45,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 15: [2022-11-26 07:19:45,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-26 07:19:45,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 63: [2022-11-26 07:19:45,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-26 07:19:45,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-26 07:19:45,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 36: [2022-11-26 07:19:45,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:19:45,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-26 07:19:45,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 8: [2022-11-26 07:19:45,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-26 07:19:45,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-26 07:19:45,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 34: [2022-11-26 07:19:45,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:19:45,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 34: [2022-11-26 07:19:45,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-26 07:19:45,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 35: [2022-11-26 07:19:45,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-26 07:19:45,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 59: [2022-11-26 07:19:45,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 6: [2022-11-26 07:19:45,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 59: [2022-11-26 07:19:45,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 6: [2022-11-26 07:19:45,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 59: [2022-11-26 07:19:45,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 6: [2022-11-26 07:19:45,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 43: [2022-11-26 07:19:45,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-26 07:19:45,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 43: [2022-11-26 07:19:45,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 47: [2022-11-26 07:19:45,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 30: [2022-11-26 07:19:45,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 47: [2022-11-26 07:19:45,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-26 07:19:45,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 30: [2022-11-26 07:19:45,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 30: [2022-11-26 07:19:45,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 29: [2022-11-26 07:19:45,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 54: [2022-11-26 07:19:45,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 29: [2022-11-26 07:19:45,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-26 07:19:45,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 54: [2022-11-26 07:19:45,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-26 07:19:45,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 41: [2022-11-26 07:19:45,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-26 07:19:45,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-26 07:19:45,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 53: [2022-11-26 07:19:45,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-26 07:19:45,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-26 07:19:45,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 28: [2022-11-26 07:19:45,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-26 07:19:45,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-26 07:19:45,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 3: [2022-11-26 07:19:45,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-26 07:19:45,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 3: [2022-11-26 07:19:45,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 20: [2022-11-26 07:19:45,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-26 07:19:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 40: [2022-11-26 07:19:45,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 20: [2022-11-26 07:19:45,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 40: [2022-11-26 07:19:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 16: [2022-11-26 07:19:45,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 18: [2022-11-26 07:19:45,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 40: [2022-11-26 07:19:45,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 16: [2022-11-26 07:19:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-26 07:19:45,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 18: [2022-11-26 07:19:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-26 07:19:45,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 25: [2022-11-26 07:19:45,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 44: [2022-11-26 07:19:45,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 25: [2022-11-26 07:19:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 44: [2022-11-26 07:19:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 36: [2022-11-26 07:19:45,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 25: [2022-11-26 07:19:45,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 44: [2022-11-26 07:19:45,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 36: [2022-11-26 07:19:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 57: [2022-11-26 07:19:45,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:19:45,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 57: [2022-11-26 07:19:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-26 07:19:45,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 10: [2022-11-26 07:19:45,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 9: [2022-11-26 07:19:45,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 10: [2022-11-26 07:19:45,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 9: [2022-11-26 07:19:45,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 10: [2022-11-26 07:19:45,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 49: [2022-11-26 07:19:45,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 49: [2022-11-26 07:19:45,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-26 07:19:45,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 19: [2022-11-26 07:19:45,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 14: [2022-11-26 07:19:45,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 19: [2022-11-26 07:19:45,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-26 07:19:45,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 14: [2022-11-26 07:19:45,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-26 07:19:45,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 9: [2022-11-26 07:19:45,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 26: [2022-11-26 07:19:45,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-26 07:19:45,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-26 07:19:45,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 21: [2022-11-26 07:19:45,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 61: [2022-11-26 07:19:45,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-26 07:19:45,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 21: [2022-11-26 07:19:45,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 23: [2022-11-26 07:19:45,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 21: [2022-11-26 07:19:45,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 60: [2022-11-26 07:19:45,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 61: [2022-11-26 07:19:45,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 60: [2022-11-26 07:19:45,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 23: [2022-11-26 07:19:45,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 1: [2022-11-26 07:19:45,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 58: [2022-11-26 07:19:45,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 60: [2022-11-26 07:19:45,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 23: [2022-11-26 07:19:45,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 58: [2022-11-26 07:19:45,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 5: [2022-11-26 07:19:45,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:19:45,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 58: [2022-11-26 07:19:45,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 1: [2022-11-26 07:19:45,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 5: [2022-11-26 07:19:45,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-26 07:19:45,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 13: [2022-11-26 07:19:45,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 13: [2022-11-26 07:19:45,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-26 07:19:45,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 22: [2022-11-26 07:19:45,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-26 07:19:45,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-26 07:19:45,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 63: [2022-11-26 07:19:45,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 22: [2022-11-26 07:19:45,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 63: [2022-11-26 07:19:45,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-26 07:19:45,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 22: [2022-11-26 07:19:45,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 4: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 4: [2022-11-26 07:19:45,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-26 07:19:45,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 55: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:19:45,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 4: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 4: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 55: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 17: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-26 07:19:45,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 55: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:19:45,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 27: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 55: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 24: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:19:45,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 24: [2022-11-26 07:19:45,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 27: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 24: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 37: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 56: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 48: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 37: [2022-11-26 07:19:45,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 38: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 37: [2022-11-26 07:19:45,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 51: [2022-11-26 07:19:45,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 56: [2022-11-26 07:19:45,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 2: [2022-11-26 07:19:45,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 48: [2022-11-26 07:19:45,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 56: [2022-11-26 07:19:45,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 48: [2022-11-26 07:19:45,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 2: [2022-11-26 07:19:45,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 51: [2022-11-26 07:19:45,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 2: [2022-11-26 07:19:45,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 51: [2022-11-26 07:19:45,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 31: [2022-11-26 07:19:45,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-26 07:19:45,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-26 07:19:45,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 7: [2022-11-26 07:19:45,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 31: [2022-11-26 07:19:45,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 7: [2022-11-26 07:19:45,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-26 07:19:45,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 31: [2022-11-26 07:19:45,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-26 07:19:45,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 38: [2022-11-26 07:19:45,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-26 07:19:45,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 42: [2022-11-26 07:19:45,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 42: [2022-11-26 07:19:45,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-26 07:19:45,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 32: [2022-11-26 07:19:45,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-26 07:19:45,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-26 07:19:45,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 33: [2022-11-26 07:19:45,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 12: [2022-11-26 07:19:45,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 33: [2022-11-26 07:19:45,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-26 07:19:45,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 12: [2022-11-26 07:19:45,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-26 07:19:45,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 46: [2022-11-26 07:19:45,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 12: [2022-11-26 07:19:45,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-26 07:19:45,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 46: [2022-11-26 07:19:45,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 8: [2022-11-26 07:19:45,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:19:45,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 46: [2022-11-26 07:19:45,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 8: [2022-11-26 07:19:45,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 8: [2022-11-26 07:19:45,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 0: [2022-11-26 07:19:45,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-26 07:19:45,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 0: [2022-11-26 07:19:45,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 45: [2022-11-26 07:19:45,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-26 07:19:45,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-26 07:19:45,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 11: [2022-11-26 07:19:45,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-26 07:19:45,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 17: [2022-11-26 07:19:45,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 11: [2022-11-26 07:19:45,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 17: [2022-11-26 07:19:45,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-26 07:19:45,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 15: [2022-11-26 07:19:45,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-26 07:19:45,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-26 07:19:45,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 50: [2022-11-26 07:19:45,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-26 07:19:45,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-26 07:19:45,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 33: [2022-11-26 07:19:45,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-26 07:19:45,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-26 07:19:45,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 38: [2022-11-26 07:19:45,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-26 07:19:45,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-26 07:19:45,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 39: [2022-11-26 07:19:45,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 39: [2022-11-26 07:19:45,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step21000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-26 07:19:45,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! 0: successfully saved checkpoint at iteration 21000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5357.80 63: iteration 21010/ 24424 | consumed samples: 10757120 | consumed tokens: 22030581760 | elapsed time per iteration (s): 2.83 | learning rate: 2.871E-05 | global batch size: 512 | lm loss: 1.995243E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.868 | TFLOPs: 18.62 | 63: iteration 21020/ 24424 | consumed samples: 10762240 | consumed tokens: 22041067520 | elapsed time per iteration (s): 2.25 | learning rate: 2.866E-05 | global batch size: 512 | lm loss: 1.993811E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.981 | TFLOPs: 23.47 | 63: iteration 21030/ 24424 | consumed samples: 10767360 | consumed tokens: 22051553280 | elapsed time per iteration (s): 2.34 | learning rate: 2.861E-05 | global batch size: 512 | lm loss: 1.995037E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 218.489 | TFLOPs: 22.49 | 63: iteration 21040/ 24424 | consumed samples: 10772480 | consumed tokens: 22062039040 | elapsed time per iteration (s): 2.24 | learning rate: 2.856E-05 | global batch size: 512 | lm loss: 1.993725E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.757 | TFLOPs: 23.55 | 63: iteration 21050/ 24424 | consumed samples: 10777600 | consumed tokens: 22072524800 | elapsed time per iteration (s): 2.23 | learning rate: 2.851E-05 | global batch size: 512 | lm loss: 1.996465E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.550 | TFLOPs: 23.63 | 63: iteration 21060/ 24424 | consumed samples: 10782720 | consumed tokens: 22083010560 | elapsed time per iteration (s): 2.28 | learning rate: 2.846E-05 | global batch size: 512 | lm loss: 1.993684E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.371 | TFLOPs: 23.10 | 63: iteration 21070/ 24424 | consumed samples: 10787840 | consumed tokens: 22093496320 | elapsed time per iteration (s): 2.28 | learning rate: 2.841E-05 | global batch size: 512 | lm loss: 1.993421E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.672 | TFLOPs: 23.13 | 63: iteration 21080/ 24424 | consumed samples: 10792960 | consumed tokens: 22103982080 | elapsed time per iteration (s): 2.25 | learning rate: 2.837E-05 | global batch size: 512 | lm loss: 1.997399E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.150 | TFLOPs: 23.38 | 63: iteration 21090/ 24424 | consumed samples: 10798080 | consumed tokens: 22114467840 | elapsed time per iteration (s): 2.26 | learning rate: 2.832E-05 | global batch size: 512 | lm loss: 1.983421E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.005 | TFLOPs: 23.37 | 63: iteration 21100/ 24424 | consumed samples: 10803200 | consumed tokens: 22124953600 | elapsed time per iteration (s): 2.23 | learning rate: 2.827E-05 | global batch size: 512 | lm loss: 2.003279E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.417 | TFLOPs: 23.62 | 63: iteration 21110/ 24424 | consumed samples: 10808320 | consumed tokens: 22135439360 | elapsed time per iteration (s): 2.23 | learning rate: 2.822E-05 | global batch size: 512 | lm loss: 1.971251E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.362 | TFLOPs: 23.61 | 63: iteration 21120/ 24424 | consumed samples: 10813440 | consumed tokens: 22145925120 | elapsed time per iteration (s): 2.23 | learning rate: 2.817E-05 | global batch size: 512 | lm loss: 1.973057E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.519 | TFLOPs: 23.63 | 63: iteration 21130/ 24424 | consumed samples: 10818560 | consumed tokens: 22156410880 | elapsed time per iteration (s): 2.24 | learning rate: 2.812E-05 | global batch size: 512 | lm loss: 1.999442E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.831 | TFLOPs: 23.56 | 63: iteration 21140/ 24424 | consumed samples: 10823680 | consumed tokens: 22166896640 | elapsed time per iteration (s): 2.24 | learning rate: 2.807E-05 | global batch size: 512 | lm loss: 1.987497E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.567 | TFLOPs: 23.53 | 63: iteration 21150/ 24424 | consumed samples: 10828800 | consumed tokens: 22177382400 | elapsed time per iteration (s): 2.23 | learning rate: 2.802E-05 | global batch size: 512 | lm loss: 1.980819E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.285 | TFLOPs: 23.60 | 63: iteration 21160/ 24424 | consumed samples: 10833920 | consumed tokens: 22187868160 | elapsed time per iteration (s): 2.27 | learning rate: 2.798E-05 | global batch size: 512 | lm loss: 1.964071E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.372 | TFLOPs: 23.20 | 63: iteration 21170/ 24424 | consumed samples: 10839040 | consumed tokens: 22198353920 | elapsed time per iteration (s): 2.24 | learning rate: 2.793E-05 | global batch size: 512 | lm loss: 1.988605E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.074 | TFLOPs: 23.58 | 63: iteration 21180/ 24424 | consumed samples: 10844160 | consumed tokens: 22208839680 | elapsed time per iteration (s): 2.24 | learning rate: 2.788E-05 | global batch size: 512 | lm loss: 1.988679E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.260 | TFLOPs: 23.50 | 63: iteration 21190/ 24424 | consumed samples: 10849280 | consumed tokens: 22219325440 | elapsed time per iteration (s): 2.25 | learning rate: 2.783E-05 | global batch size: 512 | lm loss: 1.985304E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.060 | TFLOPs: 23.37 | 63: iteration 21200/ 24424 | consumed samples: 10854400 | consumed tokens: 22229811200 | elapsed time per iteration (s): 2.25 | learning rate: 2.778E-05 | global batch size: 512 | lm loss: 2.006702E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.630 | TFLOPs: 23.43 | 63: iteration 21210/ 24424 | consumed samples: 10859520 | consumed tokens: 22240296960 | elapsed time per iteration (s): 2.28 | learning rate: 2.774E-05 | global batch size: 512 | lm loss: 1.957219E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.329 | TFLOPs: 23.09 | 63: iteration 21220/ 24424 | consumed samples: 10864640 | consumed tokens: 22250782720 | elapsed time per iteration (s): 2.25 | learning rate: 2.769E-05 | global batch size: 512 | lm loss: 1.977114E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.995 | TFLOPs: 23.47 | 63: iteration 21230/ 24424 | consumed samples: 10869760 | consumed tokens: 22261268480 | elapsed time per iteration (s): 2.23 | learning rate: 2.764E-05 | global batch size: 512 | lm loss: 1.992880E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.320 | TFLOPs: 23.61 | 63: iteration 21240/ 24424 | consumed samples: 10874880 | consumed tokens: 22271754240 | elapsed time per iteration (s): 2.26 | learning rate: 2.760E-05 | global batch size: 512 | lm loss: 1.972632E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.343 | TFLOPs: 23.30 | 63: iteration 21250/ 24424 | consumed samples: 10880000 | consumed tokens: 22282240000 | elapsed time per iteration (s): 2.23 | learning rate: 2.755E-05 | global batch size: 512 | lm loss: 2.005301E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.437 | TFLOPs: 23.62 | 63: iteration 21260/ 24424 | consumed samples: 10885120 | consumed tokens: 22292725760 | elapsed time per iteration (s): 2.25 | learning rate: 2.750E-05 | global batch size: 512 | lm loss: 1.982795E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.610 | TFLOPs: 23.43 | 63: iteration 21270/ 24424 | consumed samples: 10890240 | consumed tokens: 22303211520 | elapsed time per iteration (s): 2.23 | learning rate: 2.745E-05 | global batch size: 512 | lm loss: 1.974121E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.868 | TFLOPs: 23.66 | 63: iteration 21280/ 24424 | consumed samples: 10895360 | consumed tokens: 22313697280 | elapsed time per iteration (s): 2.29 | learning rate: 2.741E-05 | global batch size: 512 | lm loss: 1.984773E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.522 | TFLOPs: 23.01 | 63: iteration 21290/ 24424 | consumed samples: 10900480 | consumed tokens: 22324183040 | elapsed time per iteration (s): 2.23 | learning rate: 2.736E-05 | global batch size: 512 | lm loss: 1.985008E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.736 | TFLOPs: 23.65 | 63: iteration 21300/ 24424 | consumed samples: 10905600 | consumed tokens: 22334668800 | elapsed time per iteration (s): 2.23 | learning rate: 2.732E-05 | global batch size: 512 | lm loss: 1.998053E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.967 | TFLOPs: 23.67 | 63: iteration 21310/ 24424 | consumed samples: 10910720 | consumed tokens: 22345154560 | elapsed time per iteration (s): 2.29 | learning rate: 2.727E-05 | global batch size: 512 | lm loss: 1.968534E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.359 | TFLOPs: 22.99 | 63: iteration 21320/ 24424 | consumed samples: 10915840 | consumed tokens: 22355640320 | elapsed time per iteration (s): 2.26 | learning rate: 2.722E-05 | global batch size: 512 | lm loss: 1.973304E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.244 | TFLOPs: 23.29 | 63: iteration 21330/ 24424 | consumed samples: 10920960 | consumed tokens: 22366126080 | elapsed time per iteration (s): 2.24 | learning rate: 2.718E-05 | global batch size: 512 | lm loss: 1.968918E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.286 | TFLOPs: 23.50 | 63: iteration 21340/ 24424 | consumed samples: 10926080 | consumed tokens: 22376611840 | elapsed time per iteration (s): 2.23 | learning rate: 2.713E-05 | global batch size: 512 | lm loss: 1.989183E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.983 | TFLOPs: 23.68 | 63: iteration 21350/ 24424 | consumed samples: 10931200 | consumed tokens: 22387097600 | elapsed time per iteration (s): 2.53 | learning rate: 2.709E-05 | global batch size: 512 | lm loss: 1.981377E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 202.650 | TFLOPs: 20.86 | 63: iteration 21360/ 24424 | consumed samples: 10936320 | consumed tokens: 22397583360 | elapsed time per iteration (s): 2.27 | learning rate: 2.704E-05 | global batch size: 512 | lm loss: 1.992059E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.409 | TFLOPs: 23.20 | 63: iteration 21370/ 24424 | consumed samples: 10941440 | consumed tokens: 22408069120 | elapsed time per iteration (s): 2.24 | learning rate: 2.700E-05 | global batch size: 512 | lm loss: 1.964170E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.138 | TFLOPs: 23.49 | 63: iteration 21380/ 24424 | consumed samples: 10946560 | consumed tokens: 22418554880 | elapsed time per iteration (s): 2.29 | learning rate: 2.695E-05 | global batch size: 512 | lm loss: 1.958467E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.906 | TFLOPs: 23.05 | 63: iteration 21390/ 24424 | consumed samples: 10951680 | consumed tokens: 22429040640 | elapsed time per iteration (s): 2.24 | learning rate: 2.691E-05 | global batch size: 512 | lm loss: 1.981227E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.691 | TFLOPs: 23.54 | 63: iteration 21400/ 24424 | consumed samples: 10956800 | consumed tokens: 22439526400 | elapsed time per iteration (s): 2.26 | learning rate: 2.686E-05 | global batch size: 512 | lm loss: 1.982675E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.566 | TFLOPs: 23.32 | 63: iteration 21410/ 24424 | consumed samples: 10961920 | consumed tokens: 22450012160 | elapsed time per iteration (s): 2.23 | learning rate: 2.682E-05 | global batch size: 512 | lm loss: 1.961126E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.612 | TFLOPs: 23.64 | 63: iteration 21420/ 24424 | consumed samples: 10967040 | consumed tokens: 22460497920 | elapsed time per iteration (s): 2.25 | learning rate: 2.677E-05 | global batch size: 512 | lm loss: 1.994034E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.012 | TFLOPs: 23.47 | 63: iteration 21430/ 24424 | consumed samples: 10972160 | consumed tokens: 22470983680 | elapsed time per iteration (s): 2.28 | learning rate: 2.673E-05 | global batch size: 512 | lm loss: 1.977519E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.809 | TFLOPs: 23.14 | 63: iteration 21440/ 24424 | consumed samples: 10977280 | consumed tokens: 22481469440 | elapsed time per iteration (s): 2.26 | learning rate: 2.668E-05 | global batch size: 512 | lm loss: 1.961452E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.621 | TFLOPs: 23.33 | 63: iteration 21450/ 24424 | consumed samples: 10982400 | consumed tokens: 22491955200 | elapsed time per iteration (s): 2.27 | learning rate: 2.664E-05 | global batch size: 512 | lm loss: 1.991504E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.737 | TFLOPs: 23.24 | 63: iteration 21460/ 24424 | consumed samples: 10987520 | consumed tokens: 22502440960 | elapsed time per iteration (s): 2.26 | learning rate: 2.659E-05 | global batch size: 512 | lm loss: 1.985658E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.282 | TFLOPs: 23.29 | 63: iteration 21470/ 24424 | consumed samples: 10992640 | consumed tokens: 22512926720 | elapsed time per iteration (s): 2.26 | learning rate: 2.655E-05 | global batch size: 512 | lm loss: 2.000998E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.139 | TFLOPs: 23.28 | 63: iteration 21480/ 24424 | consumed samples: 10997760 | consumed tokens: 22523412480 | elapsed time per iteration (s): 2.24 | learning rate: 2.651E-05 | global batch size: 512 | lm loss: 1.978811E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.753 | TFLOPs: 23.55 | 63: iteration 21490/ 24424 | consumed samples: 11002880 | consumed tokens: 22533898240 | elapsed time per iteration (s): 2.25 | learning rate: 2.646E-05 | global batch size: 512 | lm loss: 1.975091E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.808 | TFLOPs: 23.45 | 63: iteration 21500/ 24424 | consumed samples: 11008000 | consumed tokens: 22544384000 | elapsed time per iteration (s): 3.78 | learning rate: 2.642E-05 | global batch size: 512 | lm loss: 1.985184E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 135.429 | TFLOPs: 13.94 | 63: iteration 21510/ 24424 | consumed samples: 11013120 | consumed tokens: 22554869760 | elapsed time per iteration (s): 2.27 | learning rate: 2.638E-05 | global batch size: 512 | lm loss: 2.002738E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.937 | TFLOPs: 23.26 | 63: iteration 21520/ 24424 | consumed samples: 11018240 | consumed tokens: 22565355520 | elapsed time per iteration (s): 2.24 | learning rate: 2.633E-05 | global batch size: 512 | lm loss: 1.977497E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.285 | TFLOPs: 23.50 | 63: iteration 21530/ 24424 | consumed samples: 11023360 | consumed tokens: 22575841280 | elapsed time per iteration (s): 2.37 | learning rate: 2.629E-05 | global batch size: 512 | lm loss: 1.987263E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 215.817 | TFLOPs: 22.22 | 63: iteration 21540/ 24424 | consumed samples: 11028480 | consumed tokens: 22586327040 | elapsed time per iteration (s): 2.24 | learning rate: 2.625E-05 | global batch size: 512 | lm loss: 1.988892E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.179 | TFLOPs: 23.49 | 63: iteration 21550/ 24424 | consumed samples: 11033600 | consumed tokens: 22596812800 | elapsed time per iteration (s): 2.24 | learning rate: 2.620E-05 | global batch size: 512 | lm loss: 1.993516E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.670 | TFLOPs: 23.54 | 63: iteration 21560/ 24424 | consumed samples: 11038720 | consumed tokens: 22607298560 | elapsed time per iteration (s): 2.24 | learning rate: 2.616E-05 | global batch size: 512 | lm loss: 1.986994E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.186 | TFLOPs: 23.49 | 63: iteration 21570/ 24424 | consumed samples: 11043840 | consumed tokens: 22617784320 | elapsed time per iteration (s): 2.35 | learning rate: 2.612E-05 | global batch size: 512 | lm loss: 1.968944E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 217.979 | TFLOPs: 22.44 | 63: iteration 21580/ 24424 | consumed samples: 11048960 | consumed tokens: 22628270080 | elapsed time per iteration (s): 2.29 | learning rate: 2.608E-05 | global batch size: 512 | lm loss: 1.987116E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.726 | TFLOPs: 23.03 | 63: iteration 21590/ 24424 | consumed samples: 11054080 | consumed tokens: 22638755840 | elapsed time per iteration (s): 2.26 | learning rate: 2.604E-05 | global batch size: 512 | lm loss: 1.975313E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.427 | TFLOPs: 23.31 | 63: iteration 21600/ 24424 | consumed samples: 11059200 | consumed tokens: 22649241600 | elapsed time per iteration (s): 2.23 | learning rate: 2.599E-05 | global batch size: 512 | lm loss: 1.975776E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.784 | TFLOPs: 23.66 | 63: iteration 21610/ 24424 | consumed samples: 11064320 | consumed tokens: 22659727360 | elapsed time per iteration (s): 2.26 | learning rate: 2.595E-05 | global batch size: 512 | lm loss: 1.998969E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.117 | TFLOPs: 23.28 | 63: iteration 21620/ 24424 | consumed samples: 11069440 | consumed tokens: 22670213120 | elapsed time per iteration (s): 2.23 | learning rate: 2.591E-05 | global batch size: 512 | lm loss: 1.995425E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.233 | TFLOPs: 23.60 | 63: iteration 21630/ 24424 | consumed samples: 11074560 | consumed tokens: 22680698880 | elapsed time per iteration (s): 2.27 | learning rate: 2.587E-05 | global batch size: 512 | lm loss: 1.983207E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.947 | TFLOPs: 23.26 | 63: iteration 21640/ 24424 | consumed samples: 11079680 | consumed tokens: 22691184640 | elapsed time per iteration (s): 2.27 | learning rate: 2.583E-05 | global batch size: 512 | lm loss: 1.980643E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.181 | TFLOPs: 23.18 | 63: iteration 21650/ 24424 | consumed samples: 11084800 | consumed tokens: 22701670400 | elapsed time per iteration (s): 4.14 | learning rate: 2.579E-05 | global batch size: 512 | lm loss: 1.969755E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 123.668 | TFLOPs: 12.73 | 63: iteration 21660/ 24424 | consumed samples: 11089920 | consumed tokens: 22712156160 | elapsed time per iteration (s): 2.46 | learning rate: 2.574E-05 | global batch size: 512 | lm loss: 1.979448E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 207.989 | TFLOPs: 21.41 | 63: iteration 21670/ 24424 | consumed samples: 11095040 | consumed tokens: 22722641920 | elapsed time per iteration (s): 2.26 | learning rate: 2.570E-05 | global batch size: 512 | lm loss: 1.961399E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.296 | TFLOPs: 23.30 | 63: iteration 21680/ 24424 | consumed samples: 11100160 | consumed tokens: 22733127680 | elapsed time per iteration (s): 2.27 | learning rate: 2.566E-05 | global batch size: 512 | lm loss: 1.980249E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.678 | TFLOPs: 23.23 | 63: iteration 21690/ 24424 | consumed samples: 11105280 | consumed tokens: 22743613440 | elapsed time per iteration (s): 2.32 | learning rate: 2.562E-05 | global batch size: 512 | lm loss: 1.985135E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.055 | TFLOPs: 22.76 | 63: iteration 21700/ 24424 | consumed samples: 11110400 | consumed tokens: 22754099200 | elapsed time per iteration (s): 2.26 | learning rate: 2.558E-05 | global batch size: 512 | lm loss: 1.989118E+00 | grad norm: 0.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.955 | TFLOPs: 23.36 | 63: iteration 21710/ 24424 | consumed samples: 11115520 | consumed tokens: 22764584960 | elapsed time per iteration (s): 2.26 | learning rate: 2.554E-05 | global batch size: 512 | lm loss: 1.983228E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.848 | TFLOPs: 23.35 | 63: iteration 21720/ 24424 | consumed samples: 11120640 | consumed tokens: 22775070720 | elapsed time per iteration (s): 2.25 | learning rate: 2.550E-05 | global batch size: 512 | lm loss: 1.960818E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.581 | TFLOPs: 23.43 | 63: iteration 21730/ 24424 | consumed samples: 11125760 | consumed tokens: 22785556480 | elapsed time per iteration (s): 2.24 | learning rate: 2.546E-05 | global batch size: 512 | lm loss: 1.984570E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.524 | TFLOPs: 23.53 | 63: iteration 21740/ 24424 | consumed samples: 11130880 | consumed tokens: 22796042240 | elapsed time per iteration (s): 2.28 | learning rate: 2.542E-05 | global batch size: 512 | lm loss: 1.996945E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.699 | TFLOPs: 23.13 | 63: iteration 21750/ 24424 | consumed samples: 11136000 | consumed tokens: 22806528000 | elapsed time per iteration (s): 2.25 | learning rate: 2.538E-05 | global batch size: 512 | lm loss: 2.013714E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.538 | TFLOPs: 23.42 | 63: iteration 21760/ 24424 | consumed samples: 11141120 | consumed tokens: 22817013760 | elapsed time per iteration (s): 2.23 | learning rate: 2.534E-05 | global batch size: 512 | lm loss: 1.964131E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.223 | TFLOPs: 23.60 | 63: iteration 21770/ 24424 | consumed samples: 11146240 | consumed tokens: 22827499520 | elapsed time per iteration (s): 2.23 | learning rate: 2.530E-05 | global batch size: 512 | lm loss: 1.997947E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.445 | TFLOPs: 23.62 | 63: iteration 21780/ 24424 | consumed samples: 11151360 | consumed tokens: 22837985280 | elapsed time per iteration (s): 2.24 | learning rate: 2.526E-05 | global batch size: 512 | lm loss: 1.964627E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.365 | TFLOPs: 23.51 | 63: iteration 21790/ 24424 | consumed samples: 11156480 | consumed tokens: 22848471040 | elapsed time per iteration (s): 2.24 | learning rate: 2.522E-05 | global batch size: 512 | lm loss: 1.981877E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.329 | TFLOPs: 23.51 | 63: iteration 21800/ 24424 | consumed samples: 11161600 | consumed tokens: 22858956800 | elapsed time per iteration (s): 2.28 | learning rate: 2.518E-05 | global batch size: 512 | lm loss: 1.978235E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.034 | TFLOPs: 23.17 | 63: iteration 21810/ 24424 | consumed samples: 11166720 | consumed tokens: 22869442560 | elapsed time per iteration (s): 2.25 | learning rate: 2.514E-05 | global batch size: 512 | lm loss: 1.966369E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.216 | TFLOPs: 23.39 | 63: iteration 21820/ 24424 | consumed samples: 11171840 | consumed tokens: 22879928320 | elapsed time per iteration (s): 2.24 | learning rate: 2.510E-05 | global batch size: 512 | lm loss: 1.986418E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.786 | TFLOPs: 23.55 | 63: iteration 21830/ 24424 | consumed samples: 11176960 | consumed tokens: 22890414080 | elapsed time per iteration (s): 2.23 | learning rate: 2.507E-05 | global batch size: 512 | lm loss: 1.979983E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.710 | TFLOPs: 23.65 | 63: iteration 21840/ 24424 | consumed samples: 11182080 | consumed tokens: 22900899840 | elapsed time per iteration (s): 4.20 | learning rate: 2.503E-05 | global batch size: 512 | lm loss: 1.979874E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 121.961 | TFLOPs: 12.56 | 63: iteration 21850/ 24424 | consumed samples: 11187200 | consumed tokens: 22911385600 | elapsed time per iteration (s): 2.24 | learning rate: 2.499E-05 | global batch size: 512 | lm loss: 2.010069E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.895 | TFLOPs: 23.56 | 63: iteration 21860/ 24424 | consumed samples: 11192320 | consumed tokens: 22921871360 | elapsed time per iteration (s): 2.29 | learning rate: 2.495E-05 | global batch size: 512 | lm loss: 1.997952E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.180 | TFLOPs: 22.98 | 63: iteration 21870/ 24424 | consumed samples: 11197440 | consumed tokens: 22932357120 | elapsed time per iteration (s): 2.28 | learning rate: 2.491E-05 | global batch size: 512 | lm loss: 1.978101E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.512 | TFLOPs: 23.11 | 63: iteration 21880/ 24424 | consumed samples: 11202560 | consumed tokens: 22942842880 | elapsed time per iteration (s): 2.26 | learning rate: 2.487E-05 | global batch size: 512 | lm loss: 1.986194E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.937 | TFLOPs: 23.36 | 63: iteration 21890/ 24424 | consumed samples: 11207680 | consumed tokens: 22953328640 | elapsed time per iteration (s): 2.23 | learning rate: 2.484E-05 | global batch size: 512 | lm loss: 1.981727E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.505 | TFLOPs: 23.63 | 63: iteration 21900/ 24424 | consumed samples: 11212800 | consumed tokens: 22963814400 | elapsed time per iteration (s): 2.23 | learning rate: 2.480E-05 | global batch size: 512 | lm loss: 1.985639E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.966 | TFLOPs: 23.67 | 63: iteration 21910/ 24424 | consumed samples: 11217920 | consumed tokens: 22974300160 | elapsed time per iteration (s): 2.23 | learning rate: 2.476E-05 | global batch size: 512 | lm loss: 2.000033E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.409 | TFLOPs: 23.62 | 63: iteration 21920/ 24424 | consumed samples: 11223040 | consumed tokens: 22984785920 | elapsed time per iteration (s): 2.23 | learning rate: 2.472E-05 | global batch size: 512 | lm loss: 1.983112E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.439 | TFLOPs: 23.62 | 63: iteration 21930/ 24424 | consumed samples: 11228160 | consumed tokens: 22995271680 | elapsed time per iteration (s): 2.34 | learning rate: 2.469E-05 | global batch size: 512 | lm loss: 1.980289E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 218.440 | TFLOPs: 22.49 | 63: iteration 21940/ 24424 | consumed samples: 11233280 | consumed tokens: 23005757440 | elapsed time per iteration (s): 2.23 | learning rate: 2.465E-05 | global batch size: 512 | lm loss: 1.966248E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.559 | TFLOPs: 23.63 | 63: iteration 21950/ 24424 | consumed samples: 11238400 | consumed tokens: 23016243200 | elapsed time per iteration (s): 2.32 | learning rate: 2.461E-05 | global batch size: 512 | lm loss: 1.980466E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.712 | TFLOPs: 22.72 | 63: iteration 21960/ 24424 | consumed samples: 11243520 | consumed tokens: 23026728960 | elapsed time per iteration (s): 2.29 | learning rate: 2.458E-05 | global batch size: 512 | lm loss: 1.970092E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.708 | TFLOPs: 23.03 | 63: iteration 21970/ 24424 | consumed samples: 11248640 | consumed tokens: 23037214720 | elapsed time per iteration (s): 2.31 | learning rate: 2.454E-05 | global batch size: 512 | lm loss: 2.003779E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.261 | TFLOPs: 22.78 | 63: iteration 21980/ 24424 | consumed samples: 11253760 | consumed tokens: 23047700480 | elapsed time per iteration (s): 4.00 | learning rate: 2.450E-05 | global batch size: 512 | lm loss: 1.995969E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 128.032 | TFLOPs: 13.18 | 63: iteration 21990/ 24424 | consumed samples: 11258880 | consumed tokens: 23058186240 | elapsed time per iteration (s): 2.23 | learning rate: 2.447E-05 | global batch size: 512 | lm loss: 1.975920E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.550 | TFLOPs: 23.63 | 0: [2022-11-26 07:58:38,200] [INFO] [logging.py:68:log_dist] [Rank 0] step=22000, skipped=0, lr=[2.4429231063684326e-05, 2.4429231063684326e-05, 2.4429231063684326e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] 63: iteration 22000/ 24424 | consumed samples: 11264000 | consumed tokens: 23068672000 | elapsed time per iteration (s): 2.24 | learning rate: 2.443E-05 | global batch size: 512 | lm loss: 1.965941E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.822 | TFLOPs: 23.56 | 0: steps: 22000 loss: 1.9113 iter time (s): 2.335 samples/sec: 219.315 63: ------------------------------------------------------------------------------------------- 63: valid loss at iteration 22000 | lm loss value: 1.948691E+00 | lm loss PPL: 7.019491E+00 | 63: ------------------------------------------------------------------------------------------- 0: saving checkpoint at iteration 22000 to checkpoints_3b9 0: [2022-11-26 07:58:38,985] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step22000 is begin to save! 0: [2022-11-26 07:58:39,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_01-model_00-model_states.pt... 32: [2022-11-26 07:58:39,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_21-model_00-model_states.pt... 32: [2022-11-26 07:58:39,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_21-model_00-model_states.pt. 32: [2022-11-26 07:58:39,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_22-model_00-model_states.pt... 0: [2022-11-26 07:58:39,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_01-model_00-model_states.pt. 0: [2022-11-26 07:58:39,386] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_03-model_00-model_states.pt... 32: [2022-11-26 07:58:39,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_22-model_00-model_states.pt. 32: [2022-11-26 07:58:39,490] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_23-model_00-model_states.pt... 0: [2022-11-26 07:58:39,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_03-model_00-model_states.pt. 0: [2022-11-26 07:58:39,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_04-model_00-model_states.pt... 32: [2022-11-26 07:58:39,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_23-model_00-model_states.pt. 32: [2022-11-26 07:58:39,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_24-model_00-model_states.pt... 0: [2022-11-26 07:58:39,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_04-model_00-model_states.pt. 0: [2022-11-26 07:58:39,873] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_05-model_00-model_states.pt... 32: [2022-11-26 07:58:39,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_24-model_00-model_states.pt. 32: [2022-11-26 07:58:39,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_25-model_00-model_states.pt... 0: [2022-11-26 07:58:40,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_05-model_00-model_states.pt. 0: [2022-11-26 07:58:40,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_06-model_00-model_states.pt... 32: [2022-11-26 07:58:40,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_25-model_00-model_states.pt. 32: [2022-11-26 07:58:40,203] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_26-model_00-model_states.pt... 0: [2022-11-26 07:58:40,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_06-model_00-model_states.pt. 0: [2022-11-26 07:58:40,350] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_07-model_00-model_states.pt... 32: [2022-11-26 07:58:40,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_26-model_00-model_states.pt. 32: [2022-11-26 07:58:40,437] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_27-model_00-model_states.pt... 0: [2022-11-26 07:58:40,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_07-model_00-model_states.pt. 0: [2022-11-26 07:58:40,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_08-model_00-model_states.pt... 32: [2022-11-26 07:58:40,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_27-model_00-model_states.pt. 32: [2022-11-26 07:58:40,675] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_28-model_00-model_states.pt... 0: [2022-11-26 07:58:40,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_08-model_00-model_states.pt. 0: [2022-11-26 07:58:40,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_09-model_00-model_states.pt... 32: [2022-11-26 07:58:40,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_28-model_00-model_states.pt. 32: [2022-11-26 07:58:40,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_29-model_00-model_states.pt... 0: [2022-11-26 07:58:41,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_09-model_00-model_states.pt. 0: [2022-11-26 07:58:41,047] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_10-model_00-model_states.pt... 32: [2022-11-26 07:58:41,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_29-model_00-model_states.pt. 32: [2022-11-26 07:58:41,143] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_30-model_00-model_states.pt... 0: [2022-11-26 07:58:41,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_10-model_00-model_states.pt. 0: [2022-11-26 07:58:41,272] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_11-model_00-model_states.pt... 32: [2022-11-26 07:58:41,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_30-model_00-model_states.pt. 32: [2022-11-26 07:58:41,374] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_31-model_00-model_states.pt... 0: [2022-11-26 07:58:41,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_11-model_00-model_states.pt. 0: [2022-11-26 07:58:41,500] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_12-model_00-model_states.pt... 32: [2022-11-26 07:58:41,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_31-model_00-model_states.pt. 32: [2022-11-26 07:58:41,604] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_32-model_00-model_states.pt... 0: [2022-11-26 07:58:41,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_12-model_00-model_states.pt. 0: [2022-11-26 07:58:41,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_13-model_00-model_states.pt... 32: [2022-11-26 07:58:41,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_32-model_00-model_states.pt. 32: [2022-11-26 07:58:41,837] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_33-model_00-model_states.pt... 0: [2022-11-26 07:58:41,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_13-model_00-model_states.pt. 0: [2022-11-26 07:58:41,944] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_14-model_00-model_states.pt... 32: [2022-11-26 07:58:42,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_33-model_00-model_states.pt. 32: [2022-11-26 07:58:42,068] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_34-model_00-model_states.pt... 0: [2022-11-26 07:58:42,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_14-model_00-model_states.pt. 0: [2022-11-26 07:58:42,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_15-model_00-model_states.pt... 32: [2022-11-26 07:58:42,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_34-model_00-model_states.pt. 32: [2022-11-26 07:58:42,294] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_35-model_00-model_states.pt... 0: [2022-11-26 07:58:42,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_15-model_00-model_states.pt. 0: [2022-11-26 07:58:42,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_16-model_00-model_states.pt... 32: [2022-11-26 07:58:42,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_35-model_00-model_states.pt. 32: [2022-11-26 07:58:42,523] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_36-model_00-model_states.pt... 0: [2022-11-26 07:58:42,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_16-model_00-model_states.pt. 0: [2022-11-26 07:58:42,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_17-model_00-model_states.pt... 32: [2022-11-26 07:58:42,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_36-model_00-model_states.pt. 32: [2022-11-26 07:58:42,754] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_37-model_00-model_states.pt... 0: [2022-11-26 07:58:42,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_17-model_00-model_states.pt. 0: [2022-11-26 07:58:42,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_18-model_00-model_states.pt... 32: [2022-11-26 07:58:42,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_37-model_00-model_states.pt. 32: [2022-11-26 07:58:42,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_38-model_00-model_states.pt... 0: [2022-11-26 07:58:43,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_18-model_00-model_states.pt. 0: [2022-11-26 07:58:43,048] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_19-model_00-model_states.pt... 32: [2022-11-26 07:58:43,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_38-model_00-model_states.pt. 32: [2022-11-26 07:58:43,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_40-model_00-model_states.pt... 32: [2022-11-26 07:58:43,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_40-model_00-model_states.pt. 32: [2022-11-26 07:58:43,210] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/mp_rank_01_model_states.pt... 32: [2022-11-26 07:58:43,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/mp_rank_01_model_states.pt. 0: [2022-11-26 07:58:43,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_19-model_00-model_states.pt. 0: [2022-11-26 07:58:43,267] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/layer_20-model_00-model_states.pt... 0: [2022-11-26 07:58:43,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/layer_20-model_00-model_states.pt. 0: [2022-11-26 07:58:43,485] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step22000/mp_rank_00_model_states.pt 0: [2022-11-26 07:58:43,485] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/mp_rank_00_model_states.pt... 0: [2022-11-26 07:58:43,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/mp_rank_00_model_states.pt. 62: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 62: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 62: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 40: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 40: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 32: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 48: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 48: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 45: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 45: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 50: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 50: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 50: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 51: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 55: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 55: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 55: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 53: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 53: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 57: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 61: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 61: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 59: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 59: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 63: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 63: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 52: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 52: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 54: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 54: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 54: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 56: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 56: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 58: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 58: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 58: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 62: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 62: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 60: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 60: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 33: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 33: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 33: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 35: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 35: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 35: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 34: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 34: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 40: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 40: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 40: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 44: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 44: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 44: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 44: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 32: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 32: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 32: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 32: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 42: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 42: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 46: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 46: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 46: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 46: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 36: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 36: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 48: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 48: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 41: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 41: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 37: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 37: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 37: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 47: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 47: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 47: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 49: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 43: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 43: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 43: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 43: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 45: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 45: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 39: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 39: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 39: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 50: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 38: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 38: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 38: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 51: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 55: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 53: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 57: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 57: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 57: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 61: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 59: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 59: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 59: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 63: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 63: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 63: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 19: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 21: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 52: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 52: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 54: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 54: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 56: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 58: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 58: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 16: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 16: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 60: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 60: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 2: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 4: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 14: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 14: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 14: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 10: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 10: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 10: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 10: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 10: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 8: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 8: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 26: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 11: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 17: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 17: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 17: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 17: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 9: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 9: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 13: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 23: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 23: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 25: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 27: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 27: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 27: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 27: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 27: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 29: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 29: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 29: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 29: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 31: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 5: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 33: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 1: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 1: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 1: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 1: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 7: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 7: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 7: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 7: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 3: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 35: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 35: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 22: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 22: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 22: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 22: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 12: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 30: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 30: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 30: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 30: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 24: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 24: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 24: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 24: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 24: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 28: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 28: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 28: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 28: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 20: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 20: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 20: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 20: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 34: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 44: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 44: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 6: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 32: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 32: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 42: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 42: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 46: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 46: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 36: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 41: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 41: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 37: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 47: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 47: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 49: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 43: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 45: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 39: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 39: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 38: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 51: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 55: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 53: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 61: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 19: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 19: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 21: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 15: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 15: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 56: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 16: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 2: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 4: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 14: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 14: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 18: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 18: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 10: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 10: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 10: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 8: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 8: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 26: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 26: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 11: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 11: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 17: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 17: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 17: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 9: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 9: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 9: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 13: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 13: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 13: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 13: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 13: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 23: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 23: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 23: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 25: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 25: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 25: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 25: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 27: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 27: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 27: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 29: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 29: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 31: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 31: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 31: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 31: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 5: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 5: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 33: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 1: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 1: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 1: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 7: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 7: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 3: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 22: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 22: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 12: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 30: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 30: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 24: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 24: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 24: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 28: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 28: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 20: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 20: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 34: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 6: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 6: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 32: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 36: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 37: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 49: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 38: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 19: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 19: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 21: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 15: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 15: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 15: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 16: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 16: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 16: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 2: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 2: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 4: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 4: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 14: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 14: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 18: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 8: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 8: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 26: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 11: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 11: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 11: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 9: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 9: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 23: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 23: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 25: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 25: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 29: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 29: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 31: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 5: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 1: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 7: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 3: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 3: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 22: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 12: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 12: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 12: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 30: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 28: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 20: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 6: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 19: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 21: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 15: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 16: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 2: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 4: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 14: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 18: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 18: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 8: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 26: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 26: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 11: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 9: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 13: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 23: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 31: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 5: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 3: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 22: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 12: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 12: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 30: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 6: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 19: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 21: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 15: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 16: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 2: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 2: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 4: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 18: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 18: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 8: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 26: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 31: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 5: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 3: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 12: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 6: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 19: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 21: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 2: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 4: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 5: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 3: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 6: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 21: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 21: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 4: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 3: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 6: [2022-11-26 07:58:43,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step22000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 0: [2022-11-26 07:58:43,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 17: [2022-11-26 07:58:43,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 17: [2022-11-26 07:58:43,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 17: [2022-11-26 07:58:43,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 14: [2022-11-26 07:58:43,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-26 07:58:43,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-26 07:58:43,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 32: [2022-11-26 07:58:43,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-26 07:58:43,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-26 07:58:43,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 30: [2022-11-26 07:58:43,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-26 07:58:43,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-26 07:58:43,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 26: [2022-11-26 07:58:43,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 26: [2022-11-26 07:58:43,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-26 07:58:43,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 21: [2022-11-26 07:58:43,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-26 07:58:43,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-26 07:58:43,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 22: [2022-11-26 07:58:43,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-26 07:58:43,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-26 07:58:43,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 29: [2022-11-26 07:58:43,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:58:43,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-26 07:58:43,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 34: [2022-11-26 07:58:43,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 34: [2022-11-26 07:58:43,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-26 07:58:43,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 16: [2022-11-26 07:58:43,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-26 07:58:43,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-26 07:58:43,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 40: [2022-11-26 07:58:43,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-26 07:58:43,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-26 07:58:43,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 1: [2022-11-26 07:58:43,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:58:43,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-26 07:58:43,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 19: [2022-11-26 07:58:43,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-26 07:58:43,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-26 07:58:43,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 61: [2022-11-26 07:58:43,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-26 07:58:43,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-26 07:58:43,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 4: [2022-11-26 07:58:43,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-26 07:58:43,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-26 07:58:43,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 47: [2022-11-26 07:58:43,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 44: [2022-11-26 07:58:43,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 47: [2022-11-26 07:58:43,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 44: [2022-11-26 07:58:43,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 47: [2022-11-26 07:58:43,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 44: [2022-11-26 07:58:43,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 6: [2022-11-26 07:58:43,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-26 07:58:43,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-26 07:58:43,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 51: [2022-11-26 07:58:43,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-26 07:58:43,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-26 07:58:43,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 23: [2022-11-26 07:58:43,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 49: [2022-11-26 07:58:43,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-26 07:58:43,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 14: [2022-11-26 07:58:43,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 23: [2022-11-26 07:58:43,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 49: [2022-11-26 07:58:43,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 23: [2022-11-26 07:58:43,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 14: [2022-11-26 07:58:43,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-26 07:58:43,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 0: [2022-11-26 07:58:43,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 0: [2022-11-26 07:58:43,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 0: [2022-11-26 07:58:43,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 10: [2022-11-26 07:58:43,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-26 07:58:43,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-26 07:58:43,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 57: [2022-11-26 07:58:43,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 45: [2022-11-26 07:58:43,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-26 07:58:43,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 57: [2022-11-26 07:58:43,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 45: [2022-11-26 07:58:43,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 57: [2022-11-26 07:58:43,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 11: [2022-11-26 07:58:43,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-26 07:58:43,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-26 07:58:43,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 2: [2022-11-26 07:58:43,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-26 07:58:43,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-26 07:58:43,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 42: [2022-11-26 07:58:43,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 42: [2022-11-26 07:58:43,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-26 07:58:43,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 8: [2022-11-26 07:58:43,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 60: [2022-11-26 07:58:43,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 8: [2022-11-26 07:58:43,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 60: [2022-11-26 07:58:43,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-26 07:58:43,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 8: [2022-11-26 07:58:43,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 17: [2022-11-26 07:58:43,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-26 07:58:43,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-26 07:58:43,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 7: [2022-11-26 07:58:43,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 7: [2022-11-26 07:58:43,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-26 07:58:43,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 4: [2022-11-26 07:58:43,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 4: [2022-11-26 07:58:43,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-26 07:58:43,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 56: [2022-11-26 07:58:43,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-26 07:58:43,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-26 07:58:43,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 52: [2022-11-26 07:58:43,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-26 07:58:43,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-26 07:58:43,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 13: [2022-11-26 07:58:43,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 13: [2022-11-26 07:58:43,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 40: [2022-11-26 07:58:43,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 13: [2022-11-26 07:58:43,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 40: [2022-11-26 07:58:43,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 61: [2022-11-26 07:58:43,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 16: [2022-11-26 07:58:43,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 40: [2022-11-26 07:58:43,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 16: [2022-11-26 07:58:43,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 61: [2022-11-26 07:58:43,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-26 07:58:43,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 16: [2022-11-26 07:58:43,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 7: [2022-11-26 07:58:43,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-26 07:58:43,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 7: [2022-11-26 07:58:43,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 18: [2022-11-26 07:58:43,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-26 07:58:43,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-26 07:58:43,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 10: [2022-11-26 07:58:43,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 10: [2022-11-26 07:58:43,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 10: [2022-11-26 07:58:43,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 45: [2022-11-26 07:58:43,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 37: [2022-11-26 07:58:43,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 45: [2022-11-26 07:58:43,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 37: [2022-11-26 07:58:43,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 45: [2022-11-26 07:58:43,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 37: [2022-11-26 07:58:43,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 38: [2022-11-26 07:58:43,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 1: [2022-11-26 07:58:43,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:58:43,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 13: [2022-11-26 07:58:43,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:58:43,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 13: [2022-11-26 07:58:43,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-26 07:58:43,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 48: [2022-11-26 07:58:43,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-26 07:58:43,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-26 07:58:43,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 50: [2022-11-26 07:58:43,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-26 07:58:43,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-26 07:58:43,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 62: [2022-11-26 07:58:43,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-26 07:58:43,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 30: [2022-11-26 07:58:43,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 62: [2022-11-26 07:58:43,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 20: [2022-11-26 07:58:43,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 62: [2022-11-26 07:58:43,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 20: [2022-11-26 07:58:43,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-26 07:58:43,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 30: [2022-11-26 07:58:43,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 62: [2022-11-26 07:58:43,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 30: [2022-11-26 07:58:43,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 62: [2022-11-26 07:58:43,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 38: [2022-11-26 07:58:43,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-26 07:58:43,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 15: [2022-11-26 07:58:43,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 11: [2022-11-26 07:58:43,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 15: [2022-11-26 07:58:43,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-26 07:58:43,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 11: [2022-11-26 07:58:43,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-26 07:58:43,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 48: [2022-11-26 07:58:43,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-26 07:58:43,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 15: [2022-11-26 07:58:43,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 48: [2022-11-26 07:58:43,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 15: [2022-11-26 07:58:43,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-26 07:58:43,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 50: [2022-11-26 07:58:43,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-26 07:58:43,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 6: [2022-11-26 07:58:43,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 50: [2022-11-26 07:58:43,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 6: [2022-11-26 07:58:43,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 6: [2022-11-26 07:58:43,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 44: [2022-11-26 07:58:43,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-26 07:58:43,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 59: [2022-11-26 07:58:43,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 59: [2022-11-26 07:58:43,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-26 07:58:43,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 44: [2022-11-26 07:58:43,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 49: [2022-11-26 07:58:43,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 49: [2022-11-26 07:58:43,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 49: [2022-11-26 07:58:43,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 5: [2022-11-26 07:58:43,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 5: [2022-11-26 07:58:43,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-26 07:58:43,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-26 07:58:43,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 5: [2022-11-26 07:58:43,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 5: [2022-11-26 07:58:43,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 18: [2022-11-26 07:58:43,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 18: [2022-11-26 07:58:43,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-26 07:58:43,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 21: [2022-11-26 07:58:43,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-26 07:58:43,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-26 07:58:43,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 32: [2022-11-26 07:58:43,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 32: [2022-11-26 07:58:43,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 32: [2022-11-26 07:58:43,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 42: [2022-11-26 07:58:43,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 60: [2022-11-26 07:58:43,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 60: [2022-11-26 07:58:43,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 31: [2022-11-26 07:58:43,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 42: [2022-11-26 07:58:43,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 60: [2022-11-26 07:58:43,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 31: [2022-11-26 07:58:43,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 42: [2022-11-26 07:58:43,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 31: [2022-11-26 07:58:43,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 54: [2022-11-26 07:58:43,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 51: [2022-11-26 07:58:43,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 54: [2022-11-26 07:58:43,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 51: [2022-11-26 07:58:43,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-26 07:58:43,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 54: [2022-11-26 07:58:43,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 36: [2022-11-26 07:58:43,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:58:43,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-26 07:58:43,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 52: [2022-11-26 07:58:43,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-26 07:58:43,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-26 07:58:43,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 22: [2022-11-26 07:58:43,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-26 07:58:43,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-26 07:58:43,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 19: [2022-11-26 07:58:43,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 19: [2022-11-26 07:58:43,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-26 07:58:43,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 59: [2022-11-26 07:58:43,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 59: [2022-11-26 07:58:43,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-26 07:58:43,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 8: [2022-11-26 07:58:43,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 13: [2022-11-26 07:58:43,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 8: [2022-11-26 07:58:43,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 13: [2022-11-26 07:58:43,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 8: [2022-11-26 07:58:43,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 13: [2022-11-26 07:58:43,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 0: [2022-11-26 07:58:43,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-26 07:58:43,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-26 07:58:43,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 22: [2022-11-26 07:58:43,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-26 07:58:43,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-26 07:58:43,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 46: [2022-11-26 07:58:43,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-26 07:58:43,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-26 07:58:43,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-26 07:58:43,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-26 07:58:43,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 46: [2022-11-26 07:58:43,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 29: [2022-11-26 07:58:43,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:58:43,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-26 07:58:43,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 43: [2022-11-26 07:58:43,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 20: [2022-11-26 07:58:43,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 43: [2022-11-26 07:58:43,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-26 07:58:43,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 31: [2022-11-26 07:58:43,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 20: [2022-11-26 07:58:43,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 31: [2022-11-26 07:58:43,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 20: [2022-11-26 07:58:43,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 31: [2022-11-26 07:58:43,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 4: [2022-11-26 07:58:43,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-26 07:58:43,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 26: [2022-11-26 07:58:43,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 4: [2022-11-26 07:58:43,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 26: [2022-11-26 07:58:43,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 33: [2022-11-26 07:58:43,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 26: [2022-11-26 07:58:43,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 25: [2022-11-26 07:58:43,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 25: [2022-11-26 07:58:43,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 33: [2022-11-26 07:58:43,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 25: [2022-11-26 07:58:43,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-26 07:58:43,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 33: [2022-11-26 07:58:43,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 25: [2022-11-26 07:58:43,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 25: [2022-11-26 07:58:43,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 20: [2022-11-26 07:58:43,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 54: [2022-11-26 07:58:43,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 20: [2022-11-26 07:58:43,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-26 07:58:43,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 57: [2022-11-26 07:58:43,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 54: [2022-11-26 07:58:43,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 32: [2022-11-26 07:58:43,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 54: [2022-11-26 07:58:43,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 32: [2022-11-26 07:58:43,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 41: [2022-11-26 07:58:43,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 57: [2022-11-26 07:58:43,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 32: [2022-11-26 07:58:43,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 41: [2022-11-26 07:58:43,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 57: [2022-11-26 07:58:43,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 41: [2022-11-26 07:58:43,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 41: [2022-11-26 07:58:43,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-26 07:58:43,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 40: [2022-11-26 07:58:43,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 41: [2022-11-26 07:58:43,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 40: [2022-11-26 07:58:43,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-26 07:58:43,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 38: [2022-11-26 07:58:43,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-26 07:58:43,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-26 07:58:43,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 58: [2022-11-26 07:58:43,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:58:43,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 58: [2022-11-26 07:58:43,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 36: [2022-11-26 07:58:43,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 58: [2022-11-26 07:58:43,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 36: [2022-11-26 07:58:43,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 17: [2022-11-26 07:58:43,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-26 07:58:43,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-26 07:58:43,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 19: [2022-11-26 07:58:43,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-26 07:58:43,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 45: [2022-11-26 07:58:43,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 56: [2022-11-26 07:58:43,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 45: [2022-11-26 07:58:43,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 19: [2022-11-26 07:58:43,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 45: [2022-11-26 07:58:43,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 56: [2022-11-26 07:58:43,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-26 07:58:43,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 10: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-26 07:58:43,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 10: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 35: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:58:43,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 35: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:58:43,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 6: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 35: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 6: [2022-11-26 07:58:43,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 2: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 6: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 2: [2022-11-26 07:58:43,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 47: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 16: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 47: [2022-11-26 07:58:43,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 16: [2022-11-26 07:58:43,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 47: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 12: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:58:43,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-26 07:58:43,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 12: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 16: [2022-11-26 07:58:43,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 33: [2022-11-26 07:58:43,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 37: [2022-11-26 07:58:43,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 49: [2022-11-26 07:58:43,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 33: [2022-11-26 07:58:43,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 42: [2022-11-26 07:58:43,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 37: [2022-11-26 07:58:43,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 49: [2022-11-26 07:58:43,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 33: [2022-11-26 07:58:43,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 37: [2022-11-26 07:58:43,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 49: [2022-11-26 07:58:43,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 42: [2022-11-26 07:58:43,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 50: [2022-11-26 07:58:43,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-26 07:58:43,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-26 07:58:43,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 42: [2022-11-26 07:58:43,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 44: [2022-11-26 07:58:43,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-26 07:58:43,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 47: [2022-11-26 07:58:43,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 44: [2022-11-26 07:58:43,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 47: [2022-11-26 07:58:43,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-26 07:58:43,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 30: [2022-11-26 07:58:43,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 11: [2022-11-26 07:58:43,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 11: [2022-11-26 07:58:43,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 11: [2022-11-26 07:58:43,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 30: [2022-11-26 07:58:43,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-26 07:58:43,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 59: [2022-11-26 07:58:43,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 53: [2022-11-26 07:58:43,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 59: [2022-11-26 07:58:43,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 53: [2022-11-26 07:58:43,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 59: [2022-11-26 07:58:43,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 53: [2022-11-26 07:58:43,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 23: [2022-11-26 07:58:43,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-26 07:58:43,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 23: [2022-11-26 07:58:43,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 7: [2022-11-26 07:58:43,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 7: [2022-11-26 07:58:43,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-26 07:58:43,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 34: [2022-11-26 07:58:43,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 34: [2022-11-26 07:58:43,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-26 07:58:43,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 18: [2022-11-26 07:58:43,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 62: [2022-11-26 07:58:43,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 18: [2022-11-26 07:58:43,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 62: [2022-11-26 07:58:43,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-26 07:58:43,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 18: [2022-11-26 07:58:43,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 34: [2022-11-26 07:58:43,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 0: [2022-11-26 07:58:43,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 34: [2022-11-26 07:58:43,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-26 07:58:43,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 0: [2022-11-26 07:58:43,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-26 07:58:43,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 35: [2022-11-26 07:58:43,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:58:43,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-26 07:58:43,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 27: [2022-11-26 07:58:43,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:58:43,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:58:43,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:58:43,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-26 07:58:43,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-26 07:58:43,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-26 07:58:43,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 27: [2022-11-26 07:58:43,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 14: [2022-11-26 07:58:43,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:58:43,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 14: [2022-11-26 07:58:43,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-26 07:58:43,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 1: [2022-11-26 07:58:43,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 15: [2022-11-26 07:58:43,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-26 07:58:43,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 1: [2022-11-26 07:58:43,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 15: [2022-11-26 07:58:43,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 58: [2022-11-26 07:58:43,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 1: [2022-11-26 07:58:43,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 58: [2022-11-26 07:58:43,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-26 07:58:43,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 36: [2022-11-26 07:58:43,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:58:43,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 36: [2022-11-26 07:58:43,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 46: [2022-11-26 07:58:43,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-26 07:58:43,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-26 07:58:43,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 53: [2022-11-26 07:58:43,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-26 07:58:43,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-26 07:58:43,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 61: [2022-11-26 07:58:43,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-26 07:58:43,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-26 07:58:43,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 60: [2022-11-26 07:58:43,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-26 07:58:43,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-26 07:58:43,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 5: [2022-11-26 07:58:43,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-26 07:58:43,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-26 07:58:43,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 21: [2022-11-26 07:58:43,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 21: [2022-11-26 07:58:43,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-26 07:58:43,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 52: [2022-11-26 07:58:43,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-26 07:58:43,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-26 07:58:43,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 29: [2022-11-26 07:58:43,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:58:43,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:58:43,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-26 07:58:43,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 8: [2022-11-26 07:58:43,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:58:43,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 25: [2022-11-26 07:58:43,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:58:43,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 25: [2022-11-26 07:58:43,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-26 07:58:43,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 8: [2022-11-26 07:58:43,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-26 07:58:43,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 48: [2022-11-26 07:58:43,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-26 07:58:43,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-26 07:58:43,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 10: [2022-11-26 07:58:43,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 51: [2022-11-26 07:58:43,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 10: [2022-11-26 07:58:43,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 51: [2022-11-26 07:58:43,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 10: [2022-11-26 07:58:43,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 51: [2022-11-26 07:58:43,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 56: [2022-11-26 07:58:43,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-26 07:58:43,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 63: [2022-11-26 07:58:43,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-26 07:58:43,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-26 07:58:43,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 56: [2022-11-26 07:58:43,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 63: [2022-11-26 07:58:43,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 63: [2022-11-26 07:58:43,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-26 07:58:43,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 28: [2022-11-26 07:58:43,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 28: [2022-11-26 07:58:43,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 43: [2022-11-26 07:58:43,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 17: [2022-11-26 07:58:43,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 28: [2022-11-26 07:58:43,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 43: [2022-11-26 07:58:43,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 17: [2022-11-26 07:58:43,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 28: [2022-11-26 07:58:43,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-26 07:58:43,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 43: [2022-11-26 07:58:43,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 17: [2022-11-26 07:58:43,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 28: [2022-11-26 07:58:43,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 27: [2022-11-26 07:58:43,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:58:43,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-26 07:58:43,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 37: [2022-11-26 07:58:43,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-26 07:58:43,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-26 07:58:43,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 38: [2022-11-26 07:58:43,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 57: [2022-11-26 07:58:43,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 57: [2022-11-26 07:58:43,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 2: [2022-11-26 07:58:43,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 57: [2022-11-26 07:58:43,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 2: [2022-11-26 07:58:43,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-26 07:58:43,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 38: [2022-11-26 07:58:43,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 16: [2022-11-26 07:58:43,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 23: [2022-11-26 07:58:43,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 38: [2022-11-26 07:58:43,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 16: [2022-11-26 07:58:43,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-26 07:58:43,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 23: [2022-11-26 07:58:43,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-26 07:58:43,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 39: [2022-11-26 07:58:43,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-26 07:58:43,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-26 07:58:43,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-26 07:58:43,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-26 07:58:43,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 39: [2022-11-26 07:58:43,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 62: [2022-11-26 07:58:43,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 18: [2022-11-26 07:58:43,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-26 07:58:43,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 62: [2022-11-26 07:58:43,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 18: [2022-11-26 07:58:43,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 62: [2022-11-26 07:58:43,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 55: [2022-11-26 07:58:43,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:58:43,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:58:43,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:58:43,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-26 07:58:43,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-26 07:58:43,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 55: [2022-11-26 07:58:43,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 55: [2022-11-26 07:58:43,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 55: [2022-11-26 07:58:43,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 7: [2022-11-26 07:58:43,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-26 07:58:43,793] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-26 07:58:43,793] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 19: [2022-11-26 07:58:43,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-26 07:58:43,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-26 07:58:43,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 14: [2022-11-26 07:58:43,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-26 07:58:43,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-26 07:58:43,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 61: [2022-11-26 07:58:43,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-26 07:58:43,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-26 07:58:43,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 0: [2022-11-26 07:58:43,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-26 07:58:43,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 34: [2022-11-26 07:58:43,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 34: [2022-11-26 07:58:43,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-26 07:58:43,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 13: [2022-11-26 07:58:43,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-26 07:58:43,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-26 07:58:43,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 21: [2022-11-26 07:58:43,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-26 07:58:43,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 47: [2022-11-26 07:58:43,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 21: [2022-11-26 07:58:43,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 47: [2022-11-26 07:58:43,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 47: [2022-11-26 07:58:43,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 3: [2022-11-26 07:58:43,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-26 07:58:43,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 3: [2022-11-26 07:58:43,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 3: [2022-11-26 07:58:43,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-26 07:58:43,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 3: [2022-11-26 07:58:43,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-26 07:58:43,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 3: [2022-11-26 07:58:43,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-26 07:58:43,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 3: [2022-11-26 07:58:43,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 3: [2022-11-26 07:58:43,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 3: [2022-11-26 07:58:43,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 29: [2022-11-26 07:58:43,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:58:43,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-26 07:58:43,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 2: [2022-11-26 07:58:43,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 2: [2022-11-26 07:58:43,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 2: [2022-11-26 07:58:43,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 20: [2022-11-26 07:58:43,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-26 07:58:43,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 20: [2022-11-26 07:58:43,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 30: [2022-11-26 07:58:43,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-26 07:58:43,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-26 07:58:43,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 9: [2022-11-26 07:58:43,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:58:43,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:58:43,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-26 07:58:43,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 42: [2022-11-26 07:58:43,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 42: [2022-11-26 07:58:43,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-26 07:58:43,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 9: [2022-11-26 07:58:43,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 9: [2022-11-26 07:58:43,813] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-26 07:58:43,813] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-26 07:58:43,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 9: [2022-11-26 07:58:43,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 32: [2022-11-26 07:58:43,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 32: [2022-11-26 07:58:43,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-26 07:58:43,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 24: [2022-11-26 07:58:43,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-26 07:58:43,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 24: [2022-11-26 07:58:43,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-26 07:58:43,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-26 07:58:43,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-26 07:58:43,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-26 07:58:43,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-26 07:58:43,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-26 07:58:43,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 24: [2022-11-26 07:58:43,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 24: [2022-11-26 07:58:43,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 24: [2022-11-26 07:58:43,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 39: [2022-11-26 07:58:43,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-26 07:58:43,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-26 07:58:43,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 40: [2022-11-26 07:58:43,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-26 07:58:43,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-26 07:58:43,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 41: [2022-11-26 07:58:43,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 6: [2022-11-26 07:58:43,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 41: [2022-11-26 07:58:43,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 6: [2022-11-26 07:58:43,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 41: [2022-11-26 07:58:43,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 6: [2022-11-26 07:58:43,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 45: [2022-11-26 07:58:43,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-26 07:58:43,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-26 07:58:43,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 50: [2022-11-26 07:58:43,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 50: [2022-11-26 07:58:43,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-26 07:58:43,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 15: [2022-11-26 07:58:43,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-26 07:58:43,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-26 07:58:43,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 59: [2022-11-26 07:58:43,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-26 07:58:43,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 52: [2022-11-26 07:58:43,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 59: [2022-11-26 07:58:43,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 52: [2022-11-26 07:58:43,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-26 07:58:43,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 55: [2022-11-26 07:58:43,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:58:43,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-26 07:58:43,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 25: [2022-11-26 07:58:43,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 25: [2022-11-26 07:58:43,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-26 07:58:43,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 4: [2022-11-26 07:58:43,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 4: [2022-11-26 07:58:43,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-26 07:58:43,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 11: [2022-11-26 07:58:43,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-26 07:58:43,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 11: [2022-11-26 07:58:43,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 51: [2022-11-26 07:58:43,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-26 07:58:43,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-26 07:58:43,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 60: [2022-11-26 07:58:43,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-26 07:58:43,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-26 07:58:43,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 22: [2022-11-26 07:58:43,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-26 07:58:43,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-26 07:58:43,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 5: [2022-11-26 07:58:43,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-26 07:58:43,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-26 07:58:43,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 35: [2022-11-26 07:58:43,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:58:43,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-26 07:58:43,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 46: [2022-11-26 07:58:43,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-26 07:58:43,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 46: [2022-11-26 07:58:43,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 54: [2022-11-26 07:58:43,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 54: [2022-11-26 07:58:43,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-26 07:58:43,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 9: [2022-11-26 07:58:43,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 36: [2022-11-26 07:58:43,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:58:43,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-26 07:58:43,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 9: [2022-11-26 07:58:43,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-26 07:58:43,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 23: [2022-11-26 07:58:43,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 23: [2022-11-26 07:58:43,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-26 07:58:43,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 26: [2022-11-26 07:58:43,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-26 07:58:43,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-26 07:58:43,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 12: [2022-11-26 07:58:43,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:58:43,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-26 07:58:43,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 0: [2022-11-26 07:58:43,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-26 07:58:43,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-26 07:58:43,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 8: [2022-11-26 07:58:43,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-26 07:58:43,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 8: [2022-11-26 07:58:43,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 49: [2022-11-26 07:58:43,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-26 07:58:43,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-26 07:58:43,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 48: [2022-11-26 07:58:43,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 56: [2022-11-26 07:58:43,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 48: [2022-11-26 07:58:43,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 56: [2022-11-26 07:58:43,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 48: [2022-11-26 07:58:43,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 56: [2022-11-26 07:58:43,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 57: [2022-11-26 07:58:43,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 57: [2022-11-26 07:58:43,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-26 07:58:43,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 31: [2022-11-26 07:58:43,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-26 07:58:43,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-26 07:58:43,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 38: [2022-11-26 07:58:43,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-26 07:58:43,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-26 07:58:43,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 63: [2022-11-26 07:58:43,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-26 07:58:43,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-26 07:58:43,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 53: [2022-11-26 07:58:43,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-26 07:58:43,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-26 07:58:43,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 37: [2022-11-26 07:58:43,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-26 07:58:43,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-26 07:58:43,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 58: [2022-11-26 07:58:43,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-26 07:58:43,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-26 07:58:43,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 33: [2022-11-26 07:58:43,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 43: [2022-11-26 07:58:43,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 33: [2022-11-26 07:58:43,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-26 07:58:43,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 43: [2022-11-26 07:58:43,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 10: [2022-11-26 07:58:43,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 43: [2022-11-26 07:58:43,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 10: [2022-11-26 07:58:43,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-26 07:58:43,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 44: [2022-11-26 07:58:43,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-26 07:58:43,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-26 07:58:43,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 21: [2022-11-26 07:58:43,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-26 07:58:43,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-26 07:58:43,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 16: [2022-11-26 07:58:43,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-26 07:58:43,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-26 07:58:43,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 14: [2022-11-26 07:58:43,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-26 07:58:43,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-26 07:58:43,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 19: [2022-11-26 07:58:43,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-26 07:58:43,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 2: [2022-11-26 07:58:43,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 19: [2022-11-26 07:58:43,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 2: [2022-11-26 07:58:43,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-26 07:58:43,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 18: [2022-11-26 07:58:43,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-26 07:58:43,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-26 07:58:43,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 29: [2022-11-26 07:58:43,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:58:43,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 29: [2022-11-26 07:58:43,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 34: [2022-11-26 07:58:43,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 34: [2022-11-26 07:58:43,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-26 07:58:43,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 28: [2022-11-26 07:58:43,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-26 07:58:43,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-26 07:58:43,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 17: [2022-11-26 07:58:43,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-26 07:58:43,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-26 07:58:43,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 3: [2022-11-26 07:58:43,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-26 07:58:43,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-26 07:58:43,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 61: [2022-11-26 07:58:43,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 61: [2022-11-26 07:58:43,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 61: [2022-11-26 07:58:43,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 24: [2022-11-26 07:58:43,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-26 07:58:43,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-26 07:58:43,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 20: [2022-11-26 07:58:43,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-26 07:58:43,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 20: [2022-11-26 07:58:43,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 26: [2022-11-26 07:58:43,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-26 07:58:43,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-26 07:58:43,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 32: [2022-11-26 07:58:43,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-26 07:58:43,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-26 07:58:43,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 7: [2022-11-26 07:58:43,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 7: [2022-11-26 07:58:43,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-26 07:58:43,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 40: [2022-11-26 07:58:43,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-26 07:58:43,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-26 07:58:43,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 27: [2022-11-26 07:58:43,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:58:43,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-26 07:58:43,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 13: [2022-11-26 07:58:43,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-26 07:58:43,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-26 07:58:43,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 39: [2022-11-26 07:58:43,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-26 07:58:43,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-26 07:58:43,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 62: [2022-11-26 07:58:43,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-26 07:58:43,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-26 07:58:43,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 30: [2022-11-26 07:58:43,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-26 07:58:43,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 30: [2022-11-26 07:58:43,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 4: [2022-11-26 07:58:43,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 41: [2022-11-26 07:58:43,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 4: [2022-11-26 07:58:43,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 41: [2022-11-26 07:58:43,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 4: [2022-11-26 07:58:43,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 41: [2022-11-26 07:58:43,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 45: [2022-11-26 07:58:43,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 45: [2022-11-26 07:58:43,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 45: [2022-11-26 07:58:43,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 1: [2022-11-26 07:58:43,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:58:43,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-26 07:58:43,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 51: [2022-11-26 07:58:43,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-26 07:58:43,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 60: [2022-11-26 07:58:43,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 51: [2022-11-26 07:58:43,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 60: [2022-11-26 07:58:43,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-26 07:58:43,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 6: [2022-11-26 07:58:43,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 42: [2022-11-26 07:58:43,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 6: [2022-11-26 07:58:43,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 42: [2022-11-26 07:58:43,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 6: [2022-11-26 07:58:43,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 47: [2022-11-26 07:58:43,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 42: [2022-11-26 07:58:43,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 47: [2022-11-26 07:58:43,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-26 07:58:43,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 26: [2022-11-26 07:58:43,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 26: [2022-11-26 07:58:43,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 26: [2022-11-26 07:58:43,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 15: [2022-11-26 07:58:43,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-26 07:58:43,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-26 07:58:43,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 55: [2022-11-26 07:58:43,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:58:43,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-26 07:58:43,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 11: [2022-11-26 07:58:43,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 11: [2022-11-26 07:58:43,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 22: [2022-11-26 07:58:43,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 52: [2022-11-26 07:58:43,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 11: [2022-11-26 07:58:43,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 59: [2022-11-26 07:58:43,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 52: [2022-11-26 07:58:43,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 22: [2022-11-26 07:58:43,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 59: [2022-11-26 07:58:43,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 52: [2022-11-26 07:58:43,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 22: [2022-11-26 07:58:43,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 59: [2022-11-26 07:58:43,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 44: [2022-11-26 07:58:43,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 44: [2022-11-26 07:58:43,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-26 07:58:43,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 9: [2022-11-26 07:58:43,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-26 07:58:43,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-26 07:58:43,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 5: [2022-11-26 07:58:43,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-26 07:58:43,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-26 07:58:43,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 12: [2022-11-26 07:58:43,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:58:43,926] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-26 07:58:43,926] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 46: [2022-11-26 07:58:43,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-26 07:58:43,927] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-26 07:58:43,927] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 25: [2022-11-26 07:58:43,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-26 07:58:43,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-26 07:58:43,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 49: [2022-11-26 07:58:43,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-26 07:58:43,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-26 07:58:43,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 23: [2022-11-26 07:58:43,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-26 07:58:43,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-26 07:58:43,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 48: [2022-11-26 07:58:43,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-26 07:58:43,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-26 07:58:43,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 31: [2022-11-26 07:58:43,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 31: [2022-11-26 07:58:43,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-26 07:58:43,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 50: [2022-11-26 07:58:43,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 50: [2022-11-26 07:58:43,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-26 07:58:43,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 35: [2022-11-26 07:58:43,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:58:43,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-26 07:58:43,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 54: [2022-11-26 07:58:43,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 57: [2022-11-26 07:58:43,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 54: [2022-11-26 07:58:43,935] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-26 07:58:43,935] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 0: [2022-11-26 07:58:43,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 57: [2022-11-26 07:58:43,935] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 0: [2022-11-26 07:58:43,935] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 57: [2022-11-26 07:58:43,935] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 0: [2022-11-26 07:58:43,935] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 38: [2022-11-26 07:58:43,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-26 07:58:43,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-26 07:58:43,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 36: [2022-11-26 07:58:43,939] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:58:43,939] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-26 07:58:43,939] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 63: [2022-11-26 07:58:43,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-26 07:58:43,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 63: [2022-11-26 07:58:43,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 14: [2022-11-26 07:58:43,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-26 07:58:43,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-26 07:58:43,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 43: [2022-11-26 07:58:43,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-26 07:58:43,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 28: [2022-11-26 07:58:43,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 43: [2022-11-26 07:58:43,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 28: [2022-11-26 07:58:43,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 33: [2022-11-26 07:58:43,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 28: [2022-11-26 07:58:43,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 33: [2022-11-26 07:58:43,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-26 07:58:43,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 21: [2022-11-26 07:58:43,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-26 07:58:43,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-26 07:58:43,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 8: [2022-11-26 07:58:43,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-26 07:58:43,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-26 07:58:43,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 16: [2022-11-26 07:58:43,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-26 07:58:43,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-26 07:58:43,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 20: [2022-11-26 07:58:43,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-26 07:58:43,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-26 07:58:43,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 10: [2022-11-26 07:58:43,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 10: [2022-11-26 07:58:43,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 56: [2022-11-26 07:58:43,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 10: [2022-11-26 07:58:43,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 56: [2022-11-26 07:58:43,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-26 07:58:43,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 3: [2022-11-26 07:58:43,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-26 07:58:43,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 2: [2022-11-26 07:58:43,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 3: [2022-11-26 07:58:43,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 18: [2022-11-26 07:58:43,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 53: [2022-11-26 07:58:43,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 58: [2022-11-26 07:58:43,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 2: [2022-11-26 07:58:43,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 53: [2022-11-26 07:58:43,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-26 07:58:43,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 18: [2022-11-26 07:58:43,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-26 07:58:43,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 58: [2022-11-26 07:58:43,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-26 07:58:43,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 2: [2022-11-26 07:58:43,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 27: [2022-11-26 07:58:43,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:58:43,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-26 07:58:43,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 62: [2022-11-26 07:58:43,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-26 07:58:43,948] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-26 07:58:43,948] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 37: [2022-11-26 07:58:43,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 37: [2022-11-26 07:58:43,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-26 07:58:43,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 34: [2022-11-26 07:58:43,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 29: [2022-11-26 07:58:43,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 34: [2022-11-26 07:58:43,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 26: [2022-11-26 07:58:43,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:58:43,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 34: [2022-11-26 07:58:43,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 29: [2022-11-26 07:58:43,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 26: [2022-11-26 07:58:43,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 26: [2022-11-26 07:58:43,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 61: [2022-11-26 07:58:43,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-26 07:58:43,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-26 07:58:43,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 1: [2022-11-26 07:58:43,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:58:43,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-26 07:58:43,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 7: [2022-11-26 07:58:43,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 7: [2022-11-26 07:58:43,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 7: [2022-11-26 07:58:43,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 22: [2022-11-26 07:58:43,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 19: [2022-11-26 07:58:43,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 22: [2022-11-26 07:58:43,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 32: [2022-11-26 07:58:43,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 19: [2022-11-26 07:58:43,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 22: [2022-11-26 07:58:43,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 19: [2022-11-26 07:58:43,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 32: [2022-11-26 07:58:43,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-26 07:58:43,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 17: [2022-11-26 07:58:43,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 6: [2022-11-26 07:58:43,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 17: [2022-11-26 07:58:43,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 30: [2022-11-26 07:58:43,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 17: [2022-11-26 07:58:43,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 30: [2022-11-26 07:58:43,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-26 07:58:43,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 6: [2022-11-26 07:58:43,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-26 07:58:43,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 40: [2022-11-26 07:58:43,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-26 07:58:43,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-26 07:58:43,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 42: [2022-11-26 07:58:43,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 4: [2022-11-26 07:58:43,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 42: [2022-11-26 07:58:43,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 4: [2022-11-26 07:58:43,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 42: [2022-11-26 07:58:43,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 4: [2022-11-26 07:58:43,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 60: [2022-11-26 07:58:43,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 60: [2022-11-26 07:58:43,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 60: [2022-11-26 07:58:43,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 45: [2022-11-26 07:58:43,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 45: [2022-11-26 07:58:43,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-26 07:58:43,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 41: [2022-11-26 07:58:43,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 41: [2022-11-26 07:58:43,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 41: [2022-11-26 07:58:43,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 11: [2022-11-26 07:58:43,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 39: [2022-11-26 07:58:43,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 11: [2022-11-26 07:58:43,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 39: [2022-11-26 07:58:43,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 11: [2022-11-26 07:58:43,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 39: [2022-11-26 07:58:43,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 15: [2022-11-26 07:58:43,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-26 07:58:43,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-26 07:58:43,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 24: [2022-11-26 07:58:43,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 24: [2022-11-26 07:58:43,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-26 07:58:43,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 44: [2022-11-26 07:58:43,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 13: [2022-11-26 07:58:43,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 44: [2022-11-26 07:58:43,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-26 07:58:43,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 13: [2022-11-26 07:58:43,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-26 07:58:43,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 51: [2022-11-26 07:58:43,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 9: [2022-11-26 07:58:43,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 51: [2022-11-26 07:58:43,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-26 07:58:43,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 5: [2022-11-26 07:58:43,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 5: [2022-11-26 07:58:43,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-26 07:58:43,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 9: [2022-11-26 07:58:43,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-26 07:58:43,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 25: [2022-11-26 07:58:43,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-26 07:58:43,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-26 07:58:43,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 46: [2022-11-26 07:58:43,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-26 07:58:43,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-26 07:58:43,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 35: [2022-11-26 07:58:43,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:58:43,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-26 07:58:43,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 50: [2022-11-26 07:58:43,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-26 07:58:43,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-26 07:58:43,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 55: [2022-11-26 07:58:43,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:58:43,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-26 07:58:43,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 47: [2022-11-26 07:58:43,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-26 07:58:43,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-26 07:58:43,970] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 54: [2022-11-26 07:58:43,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 54: [2022-11-26 07:58:43,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-26 07:58:43,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 52: [2022-11-26 07:58:43,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-26 07:58:43,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-26 07:58:43,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 59: [2022-11-26 07:58:43,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-26 07:58:43,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-26 07:58:43,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 36: [2022-11-26 07:58:43,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:58:43,979] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-26 07:58:43,979] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 23: [2022-11-26 07:58:43,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:58:43,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:58:43,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-26 07:58:43,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 23: [2022-11-26 07:58:43,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-26 07:58:43,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 57: [2022-11-26 07:58:43,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 57: [2022-11-26 07:58:43,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 57: [2022-11-26 07:58:43,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 48: [2022-11-26 07:58:43,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-26 07:58:43,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-26 07:58:43,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 49: [2022-11-26 07:58:43,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-26 07:58:43,981] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-26 07:58:43,981] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 31: [2022-11-26 07:58:43,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-26 07:58:43,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-26 07:58:43,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 38: [2022-11-26 07:58:43,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-26 07:58:43,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-26 07:58:43,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 63: [2022-11-26 07:58:43,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-26 07:58:43,986] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-26 07:58:43,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 8: [2022-11-26 07:58:43,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-26 07:58:43,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-26 07:58:43,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 0: [2022-11-26 07:58:43,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-26 07:58:43,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-26 07:58:43,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 56: [2022-11-26 07:58:43,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-26 07:58:43,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 56: [2022-11-26 07:58:43,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 10: [2022-11-26 07:58:43,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-26 07:58:43,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-26 07:58:43,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 53: [2022-11-26 07:58:43,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-26 07:58:43,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-26 07:58:43,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 43: [2022-11-26 07:58:43,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 43: [2022-11-26 07:58:43,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 43: [2022-11-26 07:58:43,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 58: [2022-11-26 07:58:43,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-26 07:58:43,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-26 07:58:43,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 37: [2022-11-26 07:58:43,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-26 07:58:43,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-26 07:58:43,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 28: [2022-11-26 07:58:43,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 28: [2022-11-26 07:58:43,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-26 07:58:43,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 14: [2022-11-26 07:58:44,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 16: [2022-11-26 07:58:44,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 14: [2022-11-26 07:58:44,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 16: [2022-11-26 07:58:44,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 14: [2022-11-26 07:58:44,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 16: [2022-11-26 07:58:44,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 61: [2022-11-26 07:58:44,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 61: [2022-11-26 07:58:44,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-26 07:58:44,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 33: [2022-11-26 07:58:44,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-26 07:58:44,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-26 07:58:44,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 3: [2022-11-26 07:58:44,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-26 07:58:44,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-26 07:58:44,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 21: [2022-11-26 07:58:44,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-26 07:58:44,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-26 07:58:44,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 7: [2022-11-26 07:58:44,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-26 07:58:44,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-26 07:58:44,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 19: [2022-11-26 07:58:44,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 13: [2022-11-26 07:58:44,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 19: [2022-11-26 07:58:44,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 13: [2022-11-26 07:58:44,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 19: [2022-11-26 07:58:44,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 13: [2022-11-26 07:58:44,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 2: [2022-11-26 07:58:44,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-26 07:58:44,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-26 07:58:44,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 34: [2022-11-26 07:58:44,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 34: [2022-11-26 07:58:44,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-26 07:58:44,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 26: [2022-11-26 07:58:44,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-26 07:58:44,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-26 07:58:44,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 20: [2022-11-26 07:58:44,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-26 07:58:44,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-26 07:58:44,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 29: [2022-11-26 07:58:44,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:58:44,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-26 07:58:44,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 62: [2022-11-26 07:58:44,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-26 07:58:44,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 62: [2022-11-26 07:58:44,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 1: [2022-11-26 07:58:44,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:58:44,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-26 07:58:44,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 30: [2022-11-26 07:58:44,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-26 07:58:44,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-26 07:58:44,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 42: [2022-11-26 07:58:44,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 42: [2022-11-26 07:58:44,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-26 07:58:44,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 24: [2022-11-26 07:58:44,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-26 07:58:44,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-26 07:58:44,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 27: [2022-11-26 07:58:44,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:58:44,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 18: [2022-11-26 07:58:44,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:58:44,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 18: [2022-11-26 07:58:44,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-26 07:58:44,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 6: [2022-11-26 07:58:44,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 6: [2022-11-26 07:58:44,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-26 07:58:44,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 45: [2022-11-26 07:58:44,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 17: [2022-11-26 07:58:44,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 45: [2022-11-26 07:58:44,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-26 07:58:44,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 17: [2022-11-26 07:58:44,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-26 07:58:44,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 40: [2022-11-26 07:58:44,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-26 07:58:44,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-26 07:58:44,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 32: [2022-11-26 07:58:44,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 32: [2022-11-26 07:58:44,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-26 07:58:44,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 25: [2022-11-26 07:58:44,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 25: [2022-11-26 07:58:44,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 25: [2022-11-26 07:58:44,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 60: [2022-11-26 07:58:44,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-26 07:58:44,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-26 07:58:44,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 39: [2022-11-26 07:58:44,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 39: [2022-11-26 07:58:44,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 39: [2022-11-26 07:58:44,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 4: [2022-11-26 07:58:44,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 4: [2022-11-26 07:58:44,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-26 07:58:44,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 11: [2022-11-26 07:58:44,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 44: [2022-11-26 07:58:44,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 11: [2022-11-26 07:58:44,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 44: [2022-11-26 07:58:44,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 11: [2022-11-26 07:58:44,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 44: [2022-11-26 07:58:44,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 59: [2022-11-26 07:58:44,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-26 07:58:44,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-26 07:58:44,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 41: [2022-11-26 07:58:44,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 41: [2022-11-26 07:58:44,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-26 07:58:44,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 55: [2022-11-26 07:58:44,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:58:44,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 52: [2022-11-26 07:58:44,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:58:44,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 15: [2022-11-26 07:58:44,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 52: [2022-11-26 07:58:44,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-26 07:58:44,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 15: [2022-11-26 07:58:44,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 15: [2022-11-26 07:58:44,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 22: [2022-11-26 07:58:44,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-26 07:58:44,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-26 07:58:44,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 35: [2022-11-26 07:58:44,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:58:44,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-26 07:58:44,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 23: [2022-11-26 07:58:44,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-26 07:58:44,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-26 07:58:44,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 47: [2022-11-26 07:58:44,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-26 07:58:44,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-26 07:58:44,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 50: [2022-11-26 07:58:44,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-26 07:58:44,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-26 07:58:44,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 9: [2022-11-26 07:58:44,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-26 07:58:44,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-26 07:58:44,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 54: [2022-11-26 07:58:44,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 49: [2022-11-26 07:58:44,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-26 07:58:44,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-26 07:58:44,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 51: [2022-11-26 07:58:44,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-26 07:58:44,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-26 07:58:44,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 57: [2022-11-26 07:58:44,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 57: [2022-11-26 07:58:44,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-26 07:58:44,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 5: [2022-11-26 07:58:44,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-26 07:58:44,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-26 07:58:44,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 48: [2022-11-26 07:58:44,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-26 07:58:44,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-26 07:58:44,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 54: [2022-11-26 07:58:44,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-26 07:58:44,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 12: [2022-11-26 07:58:44,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:58:44,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-26 07:58:44,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 16: [2022-11-26 07:58:44,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-26 07:58:44,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-26 07:58:44,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 14: [2022-11-26 07:58:44,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-26 07:58:44,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-26 07:58:44,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 31: [2022-11-26 07:58:44,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 30: [2022-11-26 07:58:44,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 31: [2022-11-26 07:58:44,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-26 07:58:44,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 30: [2022-11-26 07:58:44,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 30: [2022-11-26 07:58:44,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 37: [2022-11-26 07:58:44,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 37: [2022-11-26 07:58:44,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-26 07:58:44,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 26: [2022-11-26 07:58:44,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-26 07:58:44,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 10: [2022-11-26 07:58:44,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 26: [2022-11-26 07:58:44,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 10: [2022-11-26 07:58:44,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 0: [2022-11-26 07:58:44,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 13: [2022-11-26 07:58:44,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 10: [2022-11-26 07:58:44,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 0: [2022-11-26 07:58:44,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 13: [2022-11-26 07:58:44,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-26 07:58:44,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 0: [2022-11-26 07:58:44,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 7: [2022-11-26 07:58:44,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-26 07:58:44,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-26 07:58:44,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 59: [2022-11-26 07:58:44,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 46: [2022-11-26 07:58:44,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-26 07:58:44,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 59: [2022-11-26 07:58:44,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 3: [2022-11-26 07:58:44,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 46: [2022-11-26 07:58:44,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 59: [2022-11-26 07:58:44,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 3: [2022-11-26 07:58:44,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 3: [2022-11-26 07:58:44,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 29: [2022-11-26 07:58:44,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-26 07:58:44,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 29: [2022-11-26 07:58:44,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 1: [2022-11-26 07:58:44,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 1: [2022-11-26 07:58:44,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 1: [2022-11-26 07:58:44,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 32: [2022-11-26 07:58:44,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-26 07:58:44,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 21: [2022-11-26 07:58:44,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 32: [2022-11-26 07:58:44,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 21: [2022-11-26 07:58:44,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 18: [2022-11-26 07:58:44,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 21: [2022-11-26 07:58:44,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 18: [2022-11-26 07:58:44,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-26 07:58:44,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 27: [2022-11-26 07:58:44,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 27: [2022-11-26 07:58:44,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-26 07:58:44,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 6: [2022-11-26 07:58:44,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 24: [2022-11-26 07:58:44,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 6: [2022-11-26 07:58:44,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 24: [2022-11-26 07:58:44,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-26 07:58:44,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 6: [2022-11-26 07:58:44,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 60: [2022-11-26 07:58:44,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-26 07:58:44,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-26 07:58:44,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 51: [2022-11-26 07:58:44,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-26 07:58:44,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-26 07:58:44,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 56: [2022-11-26 07:58:44,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 56: [2022-11-26 07:58:44,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 8: [2022-11-26 07:58:44,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 56: [2022-11-26 07:58:44,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 8: [2022-11-26 07:58:44,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-26 07:58:44,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 62: [2022-11-26 07:58:44,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-26 07:58:44,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-26 07:58:44,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 19: [2022-11-26 07:58:44,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 38: [2022-11-26 07:58:44,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 19: [2022-11-26 07:58:44,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-26 07:58:44,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 22: [2022-11-26 07:58:44,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 38: [2022-11-26 07:58:44,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-26 07:58:44,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 22: [2022-11-26 07:58:44,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-26 07:58:44,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 15: [2022-11-26 07:58:44,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 15: [2022-11-26 07:58:44,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 61: [2022-11-26 07:58:44,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 15: [2022-11-26 07:58:44,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 61: [2022-11-26 07:58:44,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 61: [2022-11-26 07:58:44,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 20: [2022-11-26 07:58:44,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-26 07:58:44,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 17: [2022-11-26 07:58:44,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 20: [2022-11-26 07:58:44,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 45: [2022-11-26 07:58:44,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 17: [2022-11-26 07:58:44,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-26 07:58:44,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 45: [2022-11-26 07:58:44,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 39: [2022-11-26 07:58:44,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 44: [2022-11-26 07:58:44,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 45: [2022-11-26 07:58:44,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 44: [2022-11-26 07:58:44,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 39: [2022-11-26 07:58:44,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 4: [2022-11-26 07:58:44,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 44: [2022-11-26 07:58:44,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 39: [2022-11-26 07:58:44,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 4: [2022-11-26 07:58:44,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-26 07:58:44,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 28: [2022-11-26 07:58:44,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-26 07:58:44,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 63: [2022-11-26 07:58:44,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 25: [2022-11-26 07:58:44,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 28: [2022-11-26 07:58:44,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 63: [2022-11-26 07:58:44,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 25: [2022-11-26 07:58:44,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-26 07:58:44,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 63: [2022-11-26 07:58:44,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 2: [2022-11-26 07:58:44,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-26 07:58:44,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-26 07:58:44,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 35: [2022-11-26 07:58:44,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-26 07:58:44,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-26 07:58:44,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 43: [2022-11-26 07:58:44,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 34: [2022-11-26 07:58:44,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 43: [2022-11-26 07:58:44,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-26 07:58:44,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 34: [2022-11-26 07:58:44,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-26 07:58:44,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 40: [2022-11-26 07:58:44,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 42: [2022-11-26 07:58:44,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 40: [2022-11-26 07:58:44,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-26 07:58:44,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 42: [2022-11-26 07:58:44,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 36: [2022-11-26 07:58:44,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 42: [2022-11-26 07:58:44,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 36: [2022-11-26 07:58:44,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-26 07:58:44,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 9: [2022-11-26 07:58:44,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 41: [2022-11-26 07:58:44,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-26 07:58:44,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-26 07:58:44,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 23: [2022-11-26 07:58:44,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 23: [2022-11-26 07:58:44,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 23: [2022-11-26 07:58:44,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 47: [2022-11-26 07:58:44,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-26 07:58:44,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-26 07:58:44,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 49: [2022-11-26 07:58:44,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 53: [2022-11-26 07:58:44,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 52: [2022-11-26 07:58:44,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 49: [2022-11-26 07:58:44,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 53: [2022-11-26 07:58:44,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 52: [2022-11-26 07:58:44,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 49: [2022-11-26 07:58:44,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 53: [2022-11-26 07:58:44,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 52: [2022-11-26 07:58:44,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 57: [2022-11-26 07:58:44,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-26 07:58:44,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-26 07:58:44,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 48: [2022-11-26 07:58:44,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 48: [2022-11-26 07:58:44,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-26 07:58:44,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 54: [2022-11-26 07:58:44,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 50: [2022-11-26 07:58:44,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:58:44,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 55: [2022-11-26 07:58:44,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 50: [2022-11-26 07:58:44,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 55: [2022-11-26 07:58:44,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 50: [2022-11-26 07:58:44,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 33: [2022-11-26 07:58:44,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-26 07:58:44,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-26 07:58:44,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 46: [2022-11-26 07:58:44,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-26 07:58:44,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-26 07:58:44,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 5: [2022-11-26 07:58:44,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:58:44,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 5: [2022-11-26 07:58:44,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-26 07:58:44,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 12: [2022-11-26 07:58:44,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 11: [2022-11-26 07:58:44,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 12: [2022-11-26 07:58:44,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 11: [2022-11-26 07:58:44,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-26 07:58:44,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 8: [2022-11-26 07:58:44,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-26 07:58:44,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-26 07:58:44,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 54: [2022-11-26 07:58:44,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-26 07:58:44,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 31: [2022-11-26 07:58:44,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-26 07:58:44,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-26 07:58:44,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 37: [2022-11-26 07:58:44,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-26 07:58:44,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-26 07:58:44,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 9: [2022-11-26 07:58:44,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-26 07:58:44,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 31: [2022-11-26 07:58:44,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-26 07:58:44,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-26 07:58:44,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 38: [2022-11-26 07:58:44,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 56: [2022-11-26 07:58:44,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 38: [2022-11-26 07:58:44,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-26 07:58:44,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 56: [2022-11-26 07:58:44,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-26 07:58:44,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 54: [2022-11-26 07:58:44,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:58:44,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:58:44,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 43: [2022-11-26 07:58:44,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 36: [2022-11-26 07:58:44,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 43: [2022-11-26 07:58:44,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-26 07:58:44,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-26 07:58:44,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-26 07:58:44,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 43: [2022-11-26 07:58:44,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 28: [2022-11-26 07:58:44,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-26 07:58:44,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-26 07:58:44,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 53: [2022-11-26 07:58:44,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-26 07:58:44,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-26 07:58:44,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 54: [2022-11-26 07:58:44,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-26 07:58:44,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 63: [2022-11-26 07:58:44,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 9: [2022-11-26 07:58:44,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 63: [2022-11-26 07:58:44,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-26 07:58:44,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 9: [2022-11-26 07:58:44,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 41: [2022-11-26 07:58:44,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 9: [2022-11-26 07:58:44,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 41: [2022-11-26 07:58:44,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-26 07:58:44,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 53: [2022-11-26 07:58:44,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-26 07:58:44,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-26 07:58:44,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 63: [2022-11-26 07:58:44,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 63: [2022-11-26 07:58:44,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 63: [2022-11-26 07:58:44,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 58: [2022-11-26 07:58:44,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-26 07:58:44,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-26 07:58:44,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 58: [2022-11-26 07:58:44,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 58: [2022-11-26 07:58:44,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-26 07:58:44,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-26 07:58:44,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 58: [2022-11-26 07:58:44,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-26 07:58:44,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 33: [2022-11-26 07:58:44,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-26 07:58:44,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-26 07:58:44,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 28: [2022-11-26 07:58:44,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-26 07:58:44,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-26 07:58:44,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 33: [2022-11-26 07:58:44,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-26 07:58:44,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-26 07:58:44,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 39: [2022-11-26 07:58:44,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-26 07:58:44,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step22000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-26 07:58:44,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! 0: successfully saved checkpoint at iteration 22000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5179.13 63: iteration 22010/ 24424 | consumed samples: 11269120 | consumed tokens: 23079157760 | elapsed time per iteration (s): 2.84 | learning rate: 2.439E-05 | global batch size: 512 | lm loss: 1.972377E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.592 | TFLOPs: 18.59 | 63: iteration 22020/ 24424 | consumed samples: 11274240 | consumed tokens: 23089643520 | elapsed time per iteration (s): 2.23 | learning rate: 2.436E-05 | global batch size: 512 | lm loss: 1.989596E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.315 | TFLOPs: 23.61 | 63: iteration 22030/ 24424 | consumed samples: 11279360 | consumed tokens: 23100129280 | elapsed time per iteration (s): 2.23 | learning rate: 2.432E-05 | global batch size: 512 | lm loss: 1.974196E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.873 | TFLOPs: 23.66 | 63: iteration 22040/ 24424 | consumed samples: 11284480 | consumed tokens: 23110615040 | elapsed time per iteration (s): 2.41 | learning rate: 2.429E-05 | global batch size: 512 | lm loss: 1.982949E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 212.088 | TFLOPs: 21.83 | 63: iteration 22050/ 24424 | consumed samples: 11289600 | consumed tokens: 23121100800 | elapsed time per iteration (s): 2.32 | learning rate: 2.425E-05 | global batch size: 512 | lm loss: 1.984870E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.225 | TFLOPs: 22.67 | 63: iteration 22060/ 24424 | consumed samples: 11294720 | consumed tokens: 23131586560 | elapsed time per iteration (s): 2.23 | learning rate: 2.421E-05 | global batch size: 512 | lm loss: 1.990956E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.702 | TFLOPs: 23.65 | 63: iteration 22070/ 24424 | consumed samples: 11299840 | consumed tokens: 23142072320 | elapsed time per iteration (s): 2.24 | learning rate: 2.418E-05 | global batch size: 512 | lm loss: 1.968717E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.324 | TFLOPs: 23.50 | 63: iteration 22080/ 24424 | consumed samples: 11304960 | consumed tokens: 23152558080 | elapsed time per iteration (s): 2.24 | learning rate: 2.414E-05 | global batch size: 512 | lm loss: 1.993357E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.446 | TFLOPs: 23.52 | 63: iteration 22090/ 24424 | consumed samples: 11310080 | consumed tokens: 23163043840 | elapsed time per iteration (s): 2.24 | learning rate: 2.411E-05 | global batch size: 512 | lm loss: 1.959483E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.251 | TFLOPs: 23.50 | 63: iteration 22100/ 24424 | consumed samples: 11315200 | consumed tokens: 23173529600 | elapsed time per iteration (s): 2.23 | learning rate: 2.407E-05 | global batch size: 512 | lm loss: 1.964796E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.911 | TFLOPs: 23.67 | 63: iteration 22110/ 24424 | consumed samples: 11320320 | consumed tokens: 23184015360 | elapsed time per iteration (s): 2.23 | learning rate: 2.404E-05 | global batch size: 512 | lm loss: 1.982170E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.880 | TFLOPs: 23.67 | 63: iteration 22120/ 24424 | consumed samples: 11325440 | consumed tokens: 23194501120 | elapsed time per iteration (s): 2.24 | learning rate: 2.400E-05 | global batch size: 512 | lm loss: 1.977081E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.271 | TFLOPs: 23.50 | 63: iteration 22130/ 24424 | consumed samples: 11330560 | consumed tokens: 23204986880 | elapsed time per iteration (s): 2.25 | learning rate: 2.397E-05 | global batch size: 512 | lm loss: 1.982951E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.480 | TFLOPs: 23.42 | 63: iteration 22140/ 24424 | consumed samples: 11335680 | consumed tokens: 23215472640 | elapsed time per iteration (s): 2.24 | learning rate: 2.394E-05 | global batch size: 512 | lm loss: 1.978527E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.813 | TFLOPs: 23.56 | 63: iteration 22150/ 24424 | consumed samples: 11340800 | consumed tokens: 23225958400 | elapsed time per iteration (s): 2.23 | learning rate: 2.390E-05 | global batch size: 512 | lm loss: 1.966412E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.435 | TFLOPs: 23.62 | 63: iteration 22160/ 24424 | consumed samples: 11345920 | consumed tokens: 23236444160 | elapsed time per iteration (s): 2.25 | learning rate: 2.387E-05 | global batch size: 512 | lm loss: 1.977305E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.951 | TFLOPs: 23.47 | 63: iteration 22170/ 24424 | consumed samples: 11351040 | consumed tokens: 23246929920 | elapsed time per iteration (s): 2.26 | learning rate: 2.383E-05 | global batch size: 512 | lm loss: 1.971253E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.379 | TFLOPs: 23.30 | 63: iteration 22180/ 24424 | consumed samples: 11356160 | consumed tokens: 23257415680 | elapsed time per iteration (s): 2.24 | learning rate: 2.380E-05 | global batch size: 512 | lm loss: 1.978653E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.150 | TFLOPs: 23.49 | 63: iteration 22190/ 24424 | consumed samples: 11361280 | consumed tokens: 23267901440 | elapsed time per iteration (s): 2.23 | learning rate: 2.377E-05 | global batch size: 512 | lm loss: 1.984809E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.380 | TFLOPs: 23.61 | 63: iteration 22200/ 24424 | consumed samples: 11366400 | consumed tokens: 23278387200 | elapsed time per iteration (s): 2.26 | learning rate: 2.373E-05 | global batch size: 512 | lm loss: 1.981178E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.486 | TFLOPs: 23.32 | 63: iteration 22210/ 24424 | consumed samples: 11371520 | consumed tokens: 23288872960 | elapsed time per iteration (s): 2.29 | learning rate: 2.370E-05 | global batch size: 512 | lm loss: 1.987161E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.863 | TFLOPs: 23.05 | 63: iteration 22220/ 24424 | consumed samples: 11376640 | consumed tokens: 23299358720 | elapsed time per iteration (s): 2.31 | learning rate: 2.367E-05 | global batch size: 512 | lm loss: 1.961190E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.854 | TFLOPs: 22.84 | 63: iteration 22230/ 24424 | consumed samples: 11381760 | consumed tokens: 23309844480 | elapsed time per iteration (s): 2.26 | learning rate: 2.363E-05 | global batch size: 512 | lm loss: 1.979386E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.328 | TFLOPs: 23.30 | 63: iteration 22240/ 24424 | consumed samples: 11386880 | consumed tokens: 23320330240 | elapsed time per iteration (s): 2.23 | learning rate: 2.360E-05 | global batch size: 512 | lm loss: 1.981816E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.917 | TFLOPs: 23.67 | 63: iteration 22250/ 24424 | consumed samples: 11392000 | consumed tokens: 23330816000 | elapsed time per iteration (s): 2.26 | learning rate: 2.357E-05 | global batch size: 512 | lm loss: 1.963947E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.772 | TFLOPs: 23.35 | 63: iteration 22260/ 24424 | consumed samples: 11397120 | consumed tokens: 23341301760 | elapsed time per iteration (s): 2.25 | learning rate: 2.354E-05 | global batch size: 512 | lm loss: 1.986249E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.844 | TFLOPs: 23.46 | 63: iteration 22270/ 24424 | consumed samples: 11402240 | consumed tokens: 23351787520 | elapsed time per iteration (s): 2.26 | learning rate: 2.350E-05 | global batch size: 512 | lm loss: 1.973217E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.455 | TFLOPs: 23.31 | 63: iteration 22280/ 24424 | consumed samples: 11407360 | consumed tokens: 23362273280 | elapsed time per iteration (s): 2.26 | learning rate: 2.347E-05 | global batch size: 512 | lm loss: 1.964544E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.714 | TFLOPs: 23.34 | 63: iteration 22290/ 24424 | consumed samples: 11412480 | consumed tokens: 23372759040 | elapsed time per iteration (s): 4.59 | learning rate: 2.344E-05 | global batch size: 512 | lm loss: 1.983331E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 111.495 | TFLOPs: 11.48 | 63: iteration 22300/ 24424 | consumed samples: 11417600 | consumed tokens: 23383244800 | elapsed time per iteration (s): 2.23 | learning rate: 2.341E-05 | global batch size: 512 | lm loss: 1.963492E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.660 | TFLOPs: 23.64 | 63: iteration 22310/ 24424 | consumed samples: 11422720 | consumed tokens: 23393730560 | elapsed time per iteration (s): 2.25 | learning rate: 2.338E-05 | global batch size: 512 | lm loss: 1.974971E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.301 | TFLOPs: 23.40 | 63: iteration 22320/ 24424 | consumed samples: 11427840 | consumed tokens: 23404216320 | elapsed time per iteration (s): 2.26 | learning rate: 2.334E-05 | global batch size: 512 | lm loss: 1.980439E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.033 | TFLOPs: 23.37 | 63: iteration 22330/ 24424 | consumed samples: 11432960 | consumed tokens: 23414702080 | elapsed time per iteration (s): 2.25 | learning rate: 2.331E-05 | global batch size: 512 | lm loss: 1.958665E+00 | grad norm: 0.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.405 | TFLOPs: 23.41 | 63: iteration 22340/ 24424 | consumed samples: 11438080 | consumed tokens: 23425187840 | elapsed time per iteration (s): 2.23 | learning rate: 2.328E-05 | global batch size: 512 | lm loss: 1.966817E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.573 | TFLOPs: 23.63 | 63: iteration 22350/ 24424 | consumed samples: 11443200 | consumed tokens: 23435673600 | elapsed time per iteration (s): 2.30 | learning rate: 2.325E-05 | global batch size: 512 | lm loss: 1.984091E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.769 | TFLOPs: 22.93 | 63: iteration 22360/ 24424 | consumed samples: 11448320 | consumed tokens: 23446159360 | elapsed time per iteration (s): 2.28 | learning rate: 2.322E-05 | global batch size: 512 | lm loss: 1.985556E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.729 | TFLOPs: 23.13 | 63: iteration 22370/ 24424 | consumed samples: 11453440 | consumed tokens: 23456645120 | elapsed time per iteration (s): 2.29 | learning rate: 2.319E-05 | global batch size: 512 | lm loss: 1.978095E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.972 | TFLOPs: 23.06 | 63: iteration 22380/ 24424 | consumed samples: 11458560 | consumed tokens: 23467130880 | elapsed time per iteration (s): 2.24 | learning rate: 2.316E-05 | global batch size: 512 | lm loss: 1.980061E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.228 | TFLOPs: 23.50 | 63: iteration 22390/ 24424 | consumed samples: 11463680 | consumed tokens: 23477616640 | elapsed time per iteration (s): 2.24 | learning rate: 2.313E-05 | global batch size: 512 | lm loss: 1.985824E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.501 | TFLOPs: 23.52 | 63: iteration 22400/ 24424 | consumed samples: 11468800 | consumed tokens: 23488102400 | elapsed time per iteration (s): 2.23 | learning rate: 2.310E-05 | global batch size: 512 | lm loss: 1.984606E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.605 | TFLOPs: 23.64 | 63: iteration 22410/ 24424 | consumed samples: 11473920 | consumed tokens: 23498588160 | elapsed time per iteration (s): 4.85 | learning rate: 2.307E-05 | global batch size: 512 | lm loss: 1.975186E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 105.526 | TFLOPs: 10.86 | 63: iteration 22420/ 24424 | consumed samples: 11479040 | consumed tokens: 23509073920 | elapsed time per iteration (s): 2.28 | learning rate: 2.304E-05 | global batch size: 512 | lm loss: 1.972337E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.624 | TFLOPs: 23.12 | 63: iteration 22430/ 24424 | consumed samples: 11484160 | consumed tokens: 23519559680 | elapsed time per iteration (s): 2.25 | learning rate: 2.301E-05 | global batch size: 512 | lm loss: 1.972522E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.721 | TFLOPs: 23.44 | 63: iteration 22440/ 24424 | consumed samples: 11489280 | consumed tokens: 23530045440 | elapsed time per iteration (s): 2.23 | learning rate: 2.298E-05 | global batch size: 512 | lm loss: 1.999432E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.638 | TFLOPs: 23.64 | 63: iteration 22450/ 24424 | consumed samples: 11494400 | consumed tokens: 23540531200 | elapsed time per iteration (s): 2.29 | learning rate: 2.295E-05 | global batch size: 512 | lm loss: 1.988347E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.880 | TFLOPs: 23.05 | 63: iteration 22460/ 24424 | consumed samples: 11499520 | consumed tokens: 23551016960 | elapsed time per iteration (s): 2.23 | learning rate: 2.292E-05 | global batch size: 512 | lm loss: 1.987125E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.679 | TFLOPs: 23.64 | 63: iteration 22470/ 24424 | consumed samples: 11504640 | consumed tokens: 23561502720 | elapsed time per iteration (s): 2.24 | learning rate: 2.289E-05 | global batch size: 512 | lm loss: 1.983844E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.308 | TFLOPs: 23.50 | 63: iteration 22480/ 24424 | consumed samples: 11509760 | consumed tokens: 23571988480 | elapsed time per iteration (s): 2.29 | learning rate: 2.286E-05 | global batch size: 512 | lm loss: 1.980879E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.735 | TFLOPs: 23.03 | 63: iteration 22490/ 24424 | consumed samples: 11514880 | consumed tokens: 23582474240 | elapsed time per iteration (s): 2.24 | learning rate: 2.283E-05 | global batch size: 512 | lm loss: 1.968450E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.270 | TFLOPs: 23.50 | 63: iteration 22500/ 24424 | consumed samples: 11520000 | consumed tokens: 23592960000 | elapsed time per iteration (s): 2.23 | learning rate: 2.280E-05 | global batch size: 512 | lm loss: 1.966533E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.900 | TFLOPs: 23.67 | 63: iteration 22510/ 24424 | consumed samples: 11525120 | consumed tokens: 23603445760 | elapsed time per iteration (s): 2.25 | learning rate: 2.277E-05 | global batch size: 512 | lm loss: 1.971974E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.892 | TFLOPs: 23.46 | 63: iteration 22520/ 24424 | consumed samples: 11530240 | consumed tokens: 23613931520 | elapsed time per iteration (s): 2.25 | learning rate: 2.274E-05 | global batch size: 512 | lm loss: 1.965754E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.077 | TFLOPs: 23.38 | 63: iteration 22530/ 24424 | consumed samples: 11535360 | consumed tokens: 23624417280 | elapsed time per iteration (s): 2.23 | learning rate: 2.271E-05 | global batch size: 512 | lm loss: 1.984113E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.974 | TFLOPs: 23.67 | 63: iteration 22540/ 24424 | consumed samples: 11540480 | consumed tokens: 23634903040 | elapsed time per iteration (s): 2.22 | learning rate: 2.268E-05 | global batch size: 512 | lm loss: 1.966924E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.164 | TFLOPs: 23.69 | 63: iteration 22550/ 24424 | consumed samples: 11545600 | consumed tokens: 23645388800 | elapsed time per iteration (s): 2.25 | learning rate: 2.266E-05 | global batch size: 512 | lm loss: 1.980847E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.779 | TFLOPs: 23.45 | 63: iteration 22560/ 24424 | consumed samples: 11550720 | consumed tokens: 23655874560 | elapsed time per iteration (s): 2.26 | learning rate: 2.263E-05 | global batch size: 512 | lm loss: 1.993733E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.527 | TFLOPs: 23.32 | 63: iteration 22570/ 24424 | consumed samples: 11555840 | consumed tokens: 23666360320 | elapsed time per iteration (s): 2.54 | learning rate: 2.260E-05 | global batch size: 512 | lm loss: 1.987328E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 201.333 | TFLOPs: 20.73 | 63: iteration 22580/ 24424 | consumed samples: 11560960 | consumed tokens: 23676846080 | elapsed time per iteration (s): 2.23 | learning rate: 2.257E-05 | global batch size: 512 | lm loss: 1.982237E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.741 | TFLOPs: 23.65 | 63: iteration 22590/ 24424 | consumed samples: 11566080 | consumed tokens: 23687331840 | elapsed time per iteration (s): 2.22 | learning rate: 2.255E-05 | global batch size: 512 | lm loss: 1.980515E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.142 | TFLOPs: 23.69 | 63: iteration 22600/ 24424 | consumed samples: 11571200 | consumed tokens: 23697817600 | elapsed time per iteration (s): 2.23 | learning rate: 2.252E-05 | global batch size: 512 | lm loss: 1.969036E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.129 | TFLOPs: 23.59 | 63: iteration 22610/ 24424 | consumed samples: 11576320 | consumed tokens: 23708303360 | elapsed time per iteration (s): 2.26 | learning rate: 2.249E-05 | global batch size: 512 | lm loss: 1.974052E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.768 | TFLOPs: 23.34 | 63: iteration 22620/ 24424 | consumed samples: 11581440 | consumed tokens: 23718789120 | elapsed time per iteration (s): 2.29 | learning rate: 2.246E-05 | global batch size: 512 | lm loss: 1.973853E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.169 | TFLOPs: 22.97 | 63: iteration 22630/ 24424 | consumed samples: 11586560 | consumed tokens: 23729274880 | elapsed time per iteration (s): 2.29 | learning rate: 2.244E-05 | global batch size: 512 | lm loss: 1.968929E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.527 | TFLOPs: 23.01 | 63: iteration 22640/ 24424 | consumed samples: 11591680 | consumed tokens: 23739760640 | elapsed time per iteration (s): 2.24 | learning rate: 2.241E-05 | global batch size: 512 | lm loss: 1.979281E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.347 | TFLOPs: 23.51 | 63: iteration 22650/ 24424 | consumed samples: 11596800 | consumed tokens: 23750246400 | elapsed time per iteration (s): 2.27 | learning rate: 2.238E-05 | global batch size: 512 | lm loss: 1.992272E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.047 | TFLOPs: 23.27 | 63: iteration 22660/ 24424 | consumed samples: 11601920 | consumed tokens: 23760732160 | elapsed time per iteration (s): 2.25 | learning rate: 2.236E-05 | global batch size: 512 | lm loss: 1.954644E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.091 | TFLOPs: 23.38 | 63: iteration 22670/ 24424 | consumed samples: 11607040 | consumed tokens: 23771217920 | elapsed time per iteration (s): 2.23 | learning rate: 2.233E-05 | global batch size: 512 | lm loss: 1.975215E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.127 | TFLOPs: 23.59 | 63: iteration 22680/ 24424 | consumed samples: 11612160 | consumed tokens: 23781703680 | elapsed time per iteration (s): 2.26 | learning rate: 2.230E-05 | global batch size: 512 | lm loss: 1.967940E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.705 | TFLOPs: 23.34 | 63: iteration 22690/ 24424 | consumed samples: 11617280 | consumed tokens: 23792189440 | elapsed time per iteration (s): 2.28 | learning rate: 2.228E-05 | global batch size: 512 | lm loss: 1.977046E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.399 | TFLOPs: 23.10 | 63: iteration 22700/ 24424 | consumed samples: 11622400 | consumed tokens: 23802675200 | elapsed time per iteration (s): 2.27 | learning rate: 2.225E-05 | global batch size: 512 | lm loss: 1.997415E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.832 | TFLOPs: 23.25 | 63: iteration 22710/ 24424 | consumed samples: 11627520 | consumed tokens: 23813160960 | elapsed time per iteration (s): 2.23 | learning rate: 2.222E-05 | global batch size: 512 | lm loss: 1.970769E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.038 | TFLOPs: 23.68 | 63: iteration 22720/ 24424 | consumed samples: 11632640 | consumed tokens: 23823646720 | elapsed time per iteration (s): 2.24 | learning rate: 2.220E-05 | global batch size: 512 | lm loss: 1.967357E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.350 | TFLOPs: 23.51 | 63: iteration 22730/ 24424 | consumed samples: 11637760 | consumed tokens: 23834132480 | elapsed time per iteration (s): 2.23 | learning rate: 2.217E-05 | global batch size: 512 | lm loss: 1.990796E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.137 | TFLOPs: 23.59 | 63: iteration 22740/ 24424 | consumed samples: 11642880 | consumed tokens: 23844618240 | elapsed time per iteration (s): 2.25 | learning rate: 2.215E-05 | global batch size: 512 | lm loss: 1.985559E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.618 | TFLOPs: 23.43 | 63: iteration 22750/ 24424 | consumed samples: 11648000 | consumed tokens: 23855104000 | elapsed time per iteration (s): 2.25 | learning rate: 2.212E-05 | global batch size: 512 | lm loss: 1.975730E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.936 | TFLOPs: 23.46 | 63: iteration 22760/ 24424 | consumed samples: 11653120 | consumed tokens: 23865589760 | elapsed time per iteration (s): 2.26 | learning rate: 2.210E-05 | global batch size: 512 | lm loss: 1.983376E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.774 | TFLOPs: 23.35 | 63: iteration 22770/ 24424 | consumed samples: 11658240 | consumed tokens: 23876075520 | elapsed time per iteration (s): 2.23 | learning rate: 2.207E-05 | global batch size: 512 | lm loss: 1.960869E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.334 | TFLOPs: 23.61 | 63: iteration 22780/ 24424 | consumed samples: 11663360 | consumed tokens: 23886561280 | elapsed time per iteration (s): 2.25 | learning rate: 2.205E-05 | global batch size: 512 | lm loss: 1.985859E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.308 | TFLOPs: 23.40 | 63: iteration 22790/ 24424 | consumed samples: 11668480 | consumed tokens: 23897047040 | elapsed time per iteration (s): 2.23 | learning rate: 2.202E-05 | global batch size: 512 | lm loss: 1.984187E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.322 | TFLOPs: 23.61 | 63: iteration 22800/ 24424 | consumed samples: 11673600 | consumed tokens: 23907532800 | elapsed time per iteration (s): 2.24 | learning rate: 2.200E-05 | global batch size: 512 | lm loss: 1.975024E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.614 | TFLOPs: 23.53 | 63: iteration 22810/ 24424 | consumed samples: 11678720 | consumed tokens: 23918018560 | elapsed time per iteration (s): 2.23 | learning rate: 2.197E-05 | global batch size: 512 | lm loss: 1.958763E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.097 | TFLOPs: 23.58 | 63: iteration 22820/ 24424 | consumed samples: 11683840 | consumed tokens: 23928504320 | elapsed time per iteration (s): 2.24 | learning rate: 2.195E-05 | global batch size: 512 | lm loss: 1.983430E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.847 | TFLOPs: 23.56 | 63: iteration 22830/ 24424 | consumed samples: 11688960 | consumed tokens: 23938990080 | elapsed time per iteration (s): 2.26 | learning rate: 2.192E-05 | global batch size: 512 | lm loss: 1.977243E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.737 | TFLOPs: 23.34 | 63: iteration 22840/ 24424 | consumed samples: 11694080 | consumed tokens: 23949475840 | elapsed time per iteration (s): 2.26 | learning rate: 2.190E-05 | global batch size: 512 | lm loss: 1.963256E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.595 | TFLOPs: 23.33 | 63: iteration 22850/ 24424 | consumed samples: 11699200 | consumed tokens: 23959961600 | elapsed time per iteration (s): 2.24 | learning rate: 2.188E-05 | global batch size: 512 | lm loss: 1.987829E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.062 | TFLOPs: 23.58 | 63: iteration 22860/ 24424 | consumed samples: 11704320 | consumed tokens: 23970447360 | elapsed time per iteration (s): 2.22 | learning rate: 2.185E-05 | global batch size: 512 | lm loss: 1.951118E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.228 | TFLOPs: 23.70 | 63: iteration 22870/ 24424 | consumed samples: 11709440 | consumed tokens: 23980933120 | elapsed time per iteration (s): 2.27 | learning rate: 2.183E-05 | global batch size: 512 | lm loss: 1.955753E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.822 | TFLOPs: 23.25 | 63: iteration 22880/ 24424 | consumed samples: 11714560 | consumed tokens: 23991418880 | elapsed time per iteration (s): 2.23 | learning rate: 2.181E-05 | global batch size: 512 | lm loss: 1.979129E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.205 | TFLOPs: 23.60 | 63: iteration 22890/ 24424 | consumed samples: 11719680 | consumed tokens: 24001904640 | elapsed time per iteration (s): 2.24 | learning rate: 2.178E-05 | global batch size: 512 | lm loss: 1.982137E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.455 | TFLOPs: 23.52 | 63: iteration 22900/ 24424 | consumed samples: 11724800 | consumed tokens: 24012390400 | elapsed time per iteration (s): 2.24 | learning rate: 2.176E-05 | global batch size: 512 | lm loss: 1.996716E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.331 | TFLOPs: 23.51 | 63: iteration 22910/ 24424 | consumed samples: 11729920 | consumed tokens: 24022876160 | elapsed time per iteration (s): 2.25 | learning rate: 2.174E-05 | global batch size: 512 | lm loss: 1.971013E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.513 | TFLOPs: 23.42 | 63: iteration 22920/ 24424 | consumed samples: 11735040 | consumed tokens: 24033361920 | elapsed time per iteration (s): 3.28 | learning rate: 2.171E-05 | global batch size: 512 | lm loss: 1.976727E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 156.203 | TFLOPs: 16.08 | 63: iteration 22930/ 24424 | consumed samples: 11740160 | consumed tokens: 24043847680 | elapsed time per iteration (s): 2.23 | learning rate: 2.169E-05 | global batch size: 512 | lm loss: 1.971786E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.290 | TFLOPs: 23.60 | 63: iteration 22940/ 24424 | consumed samples: 11745280 | consumed tokens: 24054333440 | elapsed time per iteration (s): 2.22 | learning rate: 2.167E-05 | global batch size: 512 | lm loss: 1.980763E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.243 | TFLOPs: 23.70 | 63: iteration 22950/ 24424 | consumed samples: 11750400 | consumed tokens: 24064819200 | elapsed time per iteration (s): 2.25 | learning rate: 2.165E-05 | global batch size: 512 | lm loss: 1.999585E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.026 | TFLOPs: 23.47 | 63: iteration 22960/ 24424 | consumed samples: 11755520 | consumed tokens: 24075304960 | elapsed time per iteration (s): 2.23 | learning rate: 2.162E-05 | global batch size: 512 | lm loss: 1.983749E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.716 | TFLOPs: 23.65 | 63: iteration 22970/ 24424 | consumed samples: 11760640 | consumed tokens: 24085790720 | elapsed time per iteration (s): 2.24 | learning rate: 2.160E-05 | global batch size: 512 | lm loss: 1.976279E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.078 | TFLOPs: 23.48 | 63: iteration 22980/ 24424 | consumed samples: 11765760 | consumed tokens: 24096276480 | elapsed time per iteration (s): 2.25 | learning rate: 2.158E-05 | global batch size: 512 | lm loss: 1.987723E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.049 | TFLOPs: 23.48 | 63: iteration 22990/ 24424 | consumed samples: 11770880 | consumed tokens: 24106762240 | elapsed time per iteration (s): 2.25 | learning rate: 2.156E-05 | global batch size: 512 | lm loss: 1.969308E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.088 | TFLOPs: 23.38 | 63: iteration 23000/ 24424 | consumed samples: 11776000 | consumed tokens: 24117248000 | elapsed time per iteration (s): 2.25 | learning rate: 2.154E-05 | global batch size: 512 | lm loss: 1.987435E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.068 | TFLOPs: 23.38 | 63: ------------------------------------------------------------------------------------------- 63: valid loss at iteration 23000 | lm loss value: 1.961879E+00 | lm loss PPL: 7.112680E+00 | 63: ------------------------------------------------------------------------------------------- 0: saving checkpoint at iteration 23000 to checkpoints_3b9 0: [2022-11-26 08:37:18,282] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step23000 is begin to save! 0: [2022-11-26 08:37:18,304] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_01-model_00-model_states.pt... 32: [2022-11-26 08:37:18,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_21-model_00-model_states.pt... 32: [2022-11-26 08:37:18,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_21-model_00-model_states.pt. 32: [2022-11-26 08:37:18,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_22-model_00-model_states.pt... 0: [2022-11-26 08:37:18,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_01-model_00-model_states.pt. 0: [2022-11-26 08:37:18,661] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_03-model_00-model_states.pt... 32: [2022-11-26 08:37:18,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_22-model_00-model_states.pt. 32: [2022-11-26 08:37:18,794] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_23-model_00-model_states.pt... 0: [2022-11-26 08:37:18,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_03-model_00-model_states.pt. 0: [2022-11-26 08:37:18,903] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_04-model_00-model_states.pt... 32: [2022-11-26 08:37:19,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_23-model_00-model_states.pt. 32: [2022-11-26 08:37:19,036] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_24-model_00-model_states.pt... 0: [2022-11-26 08:37:19,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_04-model_00-model_states.pt. 0: [2022-11-26 08:37:19,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_05-model_00-model_states.pt... 32: [2022-11-26 08:37:19,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_24-model_00-model_states.pt. 32: [2022-11-26 08:37:19,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_25-model_00-model_states.pt... 0: [2022-11-26 08:37:19,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_05-model_00-model_states.pt. 0: [2022-11-26 08:37:19,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_06-model_00-model_states.pt... 32: [2022-11-26 08:37:19,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_25-model_00-model_states.pt. 32: [2022-11-26 08:37:19,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_26-model_00-model_states.pt... 0: [2022-11-26 08:37:19,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_06-model_00-model_states.pt. 0: [2022-11-26 08:37:19,610] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_07-model_00-model_states.pt... 32: [2022-11-26 08:37:19,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_26-model_00-model_states.pt. 32: [2022-11-26 08:37:19,751] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_27-model_00-model_states.pt... 0: [2022-11-26 08:37:19,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_07-model_00-model_states.pt. 0: [2022-11-26 08:37:19,844] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_08-model_00-model_states.pt... 32: [2022-11-26 08:37:19,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_27-model_00-model_states.pt. 32: [2022-11-26 08:37:19,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_28-model_00-model_states.pt... 0: [2022-11-26 08:37:20,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_08-model_00-model_states.pt. 0: [2022-11-26 08:37:20,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_09-model_00-model_states.pt... 32: [2022-11-26 08:37:20,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_28-model_00-model_states.pt. 32: [2022-11-26 08:37:20,208] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_29-model_00-model_states.pt... 0: [2022-11-26 08:37:20,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_09-model_00-model_states.pt. 0: [2022-11-26 08:37:20,323] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_10-model_00-model_states.pt... 32: [2022-11-26 08:37:20,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_29-model_00-model_states.pt. 32: [2022-11-26 08:37:20,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_30-model_00-model_states.pt... 0: [2022-11-26 08:37:20,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_10-model_00-model_states.pt. 0: [2022-11-26 08:37:20,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_11-model_00-model_states.pt... 32: [2022-11-26 08:37:20,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_30-model_00-model_states.pt. 32: [2022-11-26 08:37:20,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_31-model_00-model_states.pt... 0: [2022-11-26 08:37:20,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_11-model_00-model_states.pt. 0: [2022-11-26 08:37:20,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_12-model_00-model_states.pt... 32: [2022-11-26 08:37:20,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_31-model_00-model_states.pt. 32: [2022-11-26 08:37:20,896] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_32-model_00-model_states.pt... 0: [2022-11-26 08:37:21,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_12-model_00-model_states.pt. 0: [2022-11-26 08:37:21,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_13-model_00-model_states.pt... 32: [2022-11-26 08:37:21,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_32-model_00-model_states.pt. 32: [2022-11-26 08:37:21,137] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_33-model_00-model_states.pt... 0: [2022-11-26 08:37:21,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_13-model_00-model_states.pt. 0: [2022-11-26 08:37:21,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_14-model_00-model_states.pt... 32: [2022-11-26 08:37:21,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_33-model_00-model_states.pt. 32: [2022-11-26 08:37:21,367] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_34-model_00-model_states.pt... 0: [2022-11-26 08:37:21,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_14-model_00-model_states.pt. 0: [2022-11-26 08:37:21,488] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_15-model_00-model_states.pt... 32: [2022-11-26 08:37:21,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_34-model_00-model_states.pt. 32: [2022-11-26 08:37:21,590] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_35-model_00-model_states.pt... 0: [2022-11-26 08:37:21,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_15-model_00-model_states.pt. 0: [2022-11-26 08:37:21,713] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_16-model_00-model_states.pt... 32: [2022-11-26 08:37:21,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_35-model_00-model_states.pt. 32: [2022-11-26 08:37:21,811] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_36-model_00-model_states.pt... 0: [2022-11-26 08:37:21,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_16-model_00-model_states.pt. 0: [2022-11-26 08:37:21,948] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_17-model_00-model_states.pt... 32: [2022-11-26 08:37:22,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_36-model_00-model_states.pt. 32: [2022-11-26 08:37:22,039] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_37-model_00-model_states.pt... 0: [2022-11-26 08:37:22,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_17-model_00-model_states.pt. 0: [2022-11-26 08:37:22,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_18-model_00-model_states.pt... 32: [2022-11-26 08:37:22,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_37-model_00-model_states.pt. 32: [2022-11-26 08:37:22,259] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_38-model_00-model_states.pt... 0: [2022-11-26 08:37:22,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_18-model_00-model_states.pt. 0: [2022-11-26 08:37:22,408] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_19-model_00-model_states.pt... 32: [2022-11-26 08:37:22,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_38-model_00-model_states.pt. 32: [2022-11-26 08:37:22,490] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_40-model_00-model_states.pt... 32: [2022-11-26 08:37:22,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_40-model_00-model_states.pt. 32: [2022-11-26 08:37:22,498] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/mp_rank_01_model_states.pt... 32: [2022-11-26 08:37:22,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/mp_rank_01_model_states.pt. 0: [2022-11-26 08:37:22,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_19-model_00-model_states.pt. 0: [2022-11-26 08:37:22,633] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/layer_20-model_00-model_states.pt... 0: [2022-11-26 08:37:22,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/layer_20-model_00-model_states.pt. 0: [2022-11-26 08:37:22,861] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step23000/mp_rank_00_model_states.pt 0: [2022-11-26 08:37:22,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/mp_rank_00_model_states.pt... 0: [2022-11-26 08:37:22,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/mp_rank_00_model_states.pt. 51: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 55: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 55: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 53: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 57: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 57: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 57: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 61: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 61: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 59: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 59: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 59: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 63: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 63: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 63: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 52: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 52: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 52: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 54: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 54: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 54: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 54: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 54: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 56: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 56: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 58: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 58: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 62: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 62: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 62: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 60: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 60: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 60: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 33: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 33: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 35: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 35: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 35: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 34: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 34: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 40: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 40: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 40: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 40: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 44: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 44: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 44: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 44: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 32: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 42: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 42: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 42: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 42: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 46: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 36: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 36: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 36: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 48: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 41: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 37: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 37: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 37: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 37: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 47: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 47: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 49: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 49: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 43: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 43: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 43: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 45: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 45: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 45: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 45: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 39: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 39: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 39: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 39: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 50: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 50: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 50: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 38: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 38: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 38: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 51: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 55: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 53: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 53: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 57: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 61: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 61: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 61: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 59: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 63: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 19: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 19: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 19: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 19: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 19: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 19: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 21: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 21: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 21: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 21: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 15: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 15: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 15: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 15: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 52: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 54: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 54: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 56: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 56: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 58: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 62: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 16: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 16: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 16: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 60: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 2: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 2: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 2: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 2: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 2: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 2: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 4: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 4: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 14: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 18: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 18: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 10: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 10: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 10: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 8: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 8: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 0: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 26: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 11: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 17: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 17: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 9: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 9: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 13: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 23: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 23: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 23: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 23: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 25: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 27: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 27: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 27: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 27: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 29: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 29: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 29: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 31: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 31: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 5: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 33: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 1: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 1: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 7: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 7: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 7: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 7: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 3: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 3: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 3: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 3: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 35: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 35: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 22: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 22: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 22: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 12: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 12: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 12: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 30: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 24: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 28: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 20: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 34: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 40: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 44: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 44: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 6: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 6: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 32: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 32: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 32: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 32: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 32: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 42: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 46: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 46: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 46: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 46: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 36: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 48: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 48: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 48: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 41: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 41: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 37: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 37: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 47: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 49: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 43: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 45: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 39: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 50: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 38: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 38: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 55: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 59: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 63: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 19: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 21: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 21: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 15: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 15: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 52: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 62: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 16: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 16: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 60: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 2: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 4: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 4: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 14: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 14: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 14: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 14: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 18: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 10: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 10: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 8: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 0: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 0: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 0: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 26: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 11: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 11: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 17: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 17: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 9: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 13: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 13: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 23: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 23: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 23: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 25: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 25: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 27: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 29: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 29: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 31: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 5: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 33: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 1: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 1: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 7: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 3: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 22: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 22: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 12: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 30: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 30: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 24: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 28: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 20: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 20: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 20: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 34: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 6: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 32: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 32: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 47: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 49: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 43: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 19: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 21: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 15: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 16: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 60: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 2: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 4: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 4: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 14: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 18: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 10: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 8: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 8: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 8: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 26: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 26: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 11: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 17: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 9: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 13: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 23: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 25: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 27: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 29: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 31: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 5: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 1: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 7: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 3: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 22: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 12: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 12: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 30: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 30: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 24: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 24: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 28: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 28: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 28: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 28: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 20: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 6: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 6: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 6: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 21: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 16: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 4: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 14: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 18: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 10: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 8: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 26: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 11: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 11: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 17: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 9: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 13: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 25: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 27: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 27: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 29: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 31: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 5: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 1: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 7: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 3: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 22: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 12: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 12: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 30: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 24: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 28: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 20: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 6: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 16: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 4: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 14: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 18: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 10: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 0: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 26: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 11: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 17: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 9: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 25: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 29: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 31: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 5: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 1: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 1: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 3: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 22: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 30: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 24: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 20: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 6: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 18: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 0: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 26: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 11: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 9: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 13: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 31: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 5: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 30: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 24: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 24: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 20: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 26: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 9: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 13: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 31: [2022-11-26 08:37:23,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step23000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 0: [2022-11-26 08:37:23,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 21: [2022-11-26 08:37:23,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 21: [2022-11-26 08:37:23,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-26 08:37:23,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 1: [2022-11-26 08:37:23,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 1: [2022-11-26 08:37:23,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 1: [2022-11-26 08:37:23,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 13: [2022-11-26 08:37:23,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-26 08:37:23,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-26 08:37:23,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 53: [2022-11-26 08:37:23,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-26 08:37:23,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-26 08:37:23,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 7: [2022-11-26 08:37:23,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-26 08:37:23,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 7: [2022-11-26 08:37:23,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 11: [2022-11-26 08:37:23,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-26 08:37:23,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-26 08:37:23,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 4: [2022-11-26 08:37:23,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 4: [2022-11-26 08:37:23,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 4: [2022-11-26 08:37:23,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 6: [2022-11-26 08:37:23,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-26 08:37:23,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-26 08:37:23,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 26: [2022-11-26 08:37:23,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 26: [2022-11-26 08:37:23,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-26 08:37:23,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 10: [2022-11-26 08:37:23,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 10: [2022-11-26 08:37:23,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 24: [2022-11-26 08:37:23,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 10: [2022-11-26 08:37:23,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 24: [2022-11-26 08:37:23,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-26 08:37:23,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 32: [2022-11-26 08:37:23,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-26 08:37:23,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-26 08:37:23,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 56: [2022-11-26 08:37:23,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 8: [2022-11-26 08:37:23,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 13: [2022-11-26 08:37:23,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 8: [2022-11-26 08:37:23,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 13: [2022-11-26 08:37:23,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 56: [2022-11-26 08:37:23,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 13: [2022-11-26 08:37:23,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 56: [2022-11-26 08:37:23,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 8: [2022-11-26 08:37:23,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 33: [2022-11-26 08:37:23,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 33: [2022-11-26 08:37:23,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 33: [2022-11-26 08:37:23,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 49: [2022-11-26 08:37:23,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-26 08:37:23,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-26 08:37:23,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 37: [2022-11-26 08:37:23,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 22: [2022-11-26 08:37:23,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-26 08:37:23,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 37: [2022-11-26 08:37:23,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 22: [2022-11-26 08:37:23,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 37: [2022-11-26 08:37:23,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 23: [2022-11-26 08:37:23,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-26 08:37:23,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-26 08:37:23,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 15: [2022-11-26 08:37:23,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-26 08:37:23,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 15: [2022-11-26 08:37:23,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 10: [2022-11-26 08:37:23,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-26 08:37:23,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-26 08:37:23,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 25: [2022-11-26 08:37:23,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 1: [2022-11-26 08:37:23,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 25: [2022-11-26 08:37:23,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 25: [2022-11-26 08:37:23,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 1: [2022-11-26 08:37:23,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-26 08:37:23,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 18: [2022-11-26 08:37:23,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-26 08:37:23,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-26 08:37:23,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 44: [2022-11-26 08:37:23,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 44: [2022-11-26 08:37:23,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 44: [2022-11-26 08:37:23,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 39: [2022-11-26 08:37:23,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-26 08:37:23,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-26 08:37:23,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 7: [2022-11-26 08:37:23,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 32: [2022-11-26 08:37:23,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 7: [2022-11-26 08:37:23,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 32: [2022-11-26 08:37:23,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 23: [2022-11-26 08:37:23,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 7: [2022-11-26 08:37:23,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 32: [2022-11-26 08:37:23,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 23: [2022-11-26 08:37:23,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 23: [2022-11-26 08:37:23,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 28: [2022-11-26 08:37:23,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-26 08:37:23,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-26 08:37:23,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 46: [2022-11-26 08:37:23,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-26 08:37:23,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-26 08:37:23,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 51: [2022-11-26 08:37:23,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 51: [2022-11-26 08:37:23,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-26 08:37:23,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 6: [2022-11-26 08:37:23,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 6: [2022-11-26 08:37:23,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-26 08:37:23,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 45: [2022-11-26 08:37:23,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-26 08:37:23,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-26 08:37:23,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 4: [2022-11-26 08:37:23,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 4: [2022-11-26 08:37:23,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-26 08:37:23,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 0: [2022-11-26 08:37:23,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 3: [2022-11-26 08:37:23,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 31: [2022-11-26 08:37:23,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 0: [2022-11-26 08:37:23,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 31: [2022-11-26 08:37:23,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 3: [2022-11-26 08:37:23,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 31: [2022-11-26 08:37:23,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 0: [2022-11-26 08:37:23,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 31: [2022-11-26 08:37:23,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-26 08:37:23,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 3: [2022-11-26 08:37:23,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 28: [2022-11-26 08:37:23,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 31: [2022-11-26 08:37:23,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 28: [2022-11-26 08:37:23,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 56: [2022-11-26 08:37:23,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 28: [2022-11-26 08:37:23,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 56: [2022-11-26 08:37:23,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 56: [2022-11-26 08:37:23,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 37: [2022-11-26 08:37:23,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-26 08:37:23,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-26 08:37:23,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 52: [2022-11-26 08:37:23,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-26 08:37:23,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-26 08:37:23,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 50: [2022-11-26 08:37:23,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 50: [2022-11-26 08:37:23,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 42: [2022-11-26 08:37:23,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 50: [2022-11-26 08:37:23,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 42: [2022-11-26 08:37:23,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 42: [2022-11-26 08:37:23,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 42: [2022-11-26 08:37:23,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 42: [2022-11-26 08:37:23,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-26 08:37:23,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 53: [2022-11-26 08:37:23,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-26 08:37:23,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-26 08:37:23,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 62: [2022-11-26 08:37:23,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-26 08:37:23,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-26 08:37:23,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 29: [2022-11-26 08:37:23,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-26 08:37:23,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 21: [2022-11-26 08:37:23,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 29: [2022-11-26 08:37:23,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 21: [2022-11-26 08:37:23,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-26 08:37:23,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 22: [2022-11-26 08:37:23,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-26 08:37:23,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-26 08:37:23,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 25: [2022-11-26 08:37:23,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-26 08:37:23,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-26 08:37:23,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 15: [2022-11-26 08:37:23,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-26 08:37:23,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 52: [2022-11-26 08:37:23,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 11: [2022-11-26 08:37:23,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 15: [2022-11-26 08:37:23,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 52: [2022-11-26 08:37:23,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 11: [2022-11-26 08:37:23,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 52: [2022-11-26 08:37:23,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 11: [2022-11-26 08:37:23,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 41: [2022-11-26 08:37:23,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-26 08:37:23,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 39: [2022-11-26 08:37:23,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 41: [2022-11-26 08:37:23,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 41: [2022-11-26 08:37:23,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 39: [2022-11-26 08:37:23,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 41: [2022-11-26 08:37:23,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 41: [2022-11-26 08:37:23,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 39: [2022-11-26 08:37:23,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 47: [2022-11-26 08:37:23,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-26 08:37:23,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-26 08:37:23,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 44: [2022-11-26 08:37:23,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 35: [2022-11-26 08:37:23,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-26 08:37:23,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-26 08:37:23,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 24: [2022-11-26 08:37:23,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-26 08:37:23,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 44: [2022-11-26 08:37:23,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 44: [2022-11-26 08:37:23,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 24: [2022-11-26 08:37:23,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 45: [2022-11-26 08:37:23,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-26 08:37:23,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-26 08:37:23,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 29: [2022-11-26 08:37:23,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 49: [2022-11-26 08:37:23,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 29: [2022-11-26 08:37:23,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 49: [2022-11-26 08:37:23,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 29: [2022-11-26 08:37:23,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 49: [2022-11-26 08:37:23,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 55: [2022-11-26 08:37:23,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 55: [2022-11-26 08:37:23,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-26 08:37:23,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 33: [2022-11-26 08:37:23,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-26 08:37:23,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-26 08:37:23,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 62: [2022-11-26 08:37:23,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 62: [2022-11-26 08:37:23,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-26 08:37:23,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 31: [2022-11-26 08:37:23,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 31: [2022-11-26 08:37:23,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-26 08:37:23,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 21: [2022-11-26 08:37:23,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-26 08:37:23,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-26 08:37:23,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 10: [2022-11-26 08:37:23,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-26 08:37:23,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-26 08:37:23,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 3: [2022-11-26 08:37:23,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 47: [2022-11-26 08:37:23,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-26 08:37:23,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-26 08:37:23,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 3: [2022-11-26 08:37:23,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 15: [2022-11-26 08:37:23,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 15: [2022-11-26 08:37:23,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 3: [2022-11-26 08:37:23,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 15: [2022-11-26 08:37:23,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 18: [2022-11-26 08:37:23,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 51: [2022-11-26 08:37:23,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 51: [2022-11-26 08:37:23,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-26 08:37:23,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 3: [2022-11-26 08:37:23,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 6: [2022-11-26 08:37:23,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 32: [2022-11-26 08:37:23,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-26 08:37:23,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 3: [2022-11-26 08:37:23,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 32: [2022-11-26 08:37:23,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 3: [2022-11-26 08:37:23,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 18: [2022-11-26 08:37:23,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-26 08:37:23,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 8: [2022-11-26 08:37:23,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 8: [2022-11-26 08:37:23,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-26 08:37:23,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 6: [2022-11-26 08:37:23,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-26 08:37:23,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 4: [2022-11-26 08:37:23,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 25: [2022-11-26 08:37:23,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 2: [2022-11-26 08:37:23,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-26 08:37:23,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 25: [2022-11-26 08:37:23,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 2: [2022-11-26 08:37:23,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 25: [2022-11-26 08:37:23,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 7: [2022-11-26 08:37:23,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 2: [2022-11-26 08:37:23,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 2: [2022-11-26 08:37:23,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 2: [2022-11-26 08:37:23,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 7: [2022-11-26 08:37:23,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-26 08:37:23,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 38: [2022-11-26 08:37:23,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 38: [2022-11-26 08:37:23,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 38: [2022-11-26 08:37:23,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-26 08:37:23,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-26 08:37:23,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 38: [2022-11-26 08:37:23,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 62: [2022-11-26 08:37:23,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-26 08:37:23,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-26 08:37:23,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 1: [2022-11-26 08:37:23,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 1: [2022-11-26 08:37:23,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 1: [2022-11-26 08:37:23,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 39: [2022-11-26 08:37:23,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 52: [2022-11-26 08:37:23,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 2: [2022-11-26 08:37:23,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 39: [2022-11-26 08:37:23,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 52: [2022-11-26 08:37:23,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 2: [2022-11-26 08:37:23,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 39: [2022-11-26 08:37:23,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 52: [2022-11-26 08:37:23,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 2: [2022-11-26 08:37:23,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 47: [2022-11-26 08:37:23,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-26 08:37:23,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 47: [2022-11-26 08:37:23,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 4: [2022-11-26 08:37:23,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-26 08:37:23,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 46: [2022-11-26 08:37:23,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-26 08:37:23,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-26 08:37:23,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 0: [2022-11-26 08:37:23,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 35: [2022-11-26 08:37:23,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-26 08:37:23,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 0: [2022-11-26 08:37:23,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 35: [2022-11-26 08:37:23,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 0: [2022-11-26 08:37:23,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 18: [2022-11-26 08:37:23,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-26 08:37:23,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-26 08:37:23,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 11: [2022-11-26 08:37:23,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 37: [2022-11-26 08:37:23,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 53: [2022-11-26 08:37:23,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 11: [2022-11-26 08:37:23,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 37: [2022-11-26 08:37:23,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 53: [2022-11-26 08:37:23,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 11: [2022-11-26 08:37:23,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 37: [2022-11-26 08:37:23,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 53: [2022-11-26 08:37:23,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 26: [2022-11-26 08:37:23,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-26 08:37:23,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-26 08:37:23,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 50: [2022-11-26 08:37:23,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-26 08:37:23,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-26 08:37:23,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 38: [2022-11-26 08:37:23,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 8: [2022-11-26 08:37:23,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-26 08:37:23,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-26 08:37:23,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 38: [2022-11-26 08:37:23,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-26 08:37:23,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 46: [2022-11-26 08:37:23,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 29: [2022-11-26 08:37:23,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 46: [2022-11-26 08:37:23,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 29: [2022-11-26 08:37:23,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 46: [2022-11-26 08:37:23,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 29: [2022-11-26 08:37:23,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 22: [2022-11-26 08:37:23,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 41: [2022-11-26 08:37:23,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 22: [2022-11-26 08:37:23,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 41: [2022-11-26 08:37:23,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 33: [2022-11-26 08:37:23,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 22: [2022-11-26 08:37:23,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 42: [2022-11-26 08:37:23,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 41: [2022-11-26 08:37:23,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 33: [2022-11-26 08:37:23,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-26 08:37:23,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 42: [2022-11-26 08:37:23,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-26 08:37:23,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 44: [2022-11-26 08:37:23,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-26 08:37:23,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-26 08:37:23,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 58: [2022-11-26 08:37:23,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-26 08:37:23,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 26: [2022-11-26 08:37:23,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 58: [2022-11-26 08:37:23,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 26: [2022-11-26 08:37:23,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-26 08:37:23,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 28: [2022-11-26 08:37:23,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 28: [2022-11-26 08:37:23,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-26 08:37:23,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 61: [2022-11-26 08:37:23,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-26 08:37:23,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-26 08:37:23,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-26 08:37:23,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 61: [2022-11-26 08:37:23,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-26 08:37:23,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 49: [2022-11-26 08:37:23,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-26 08:37:23,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-26 08:37:23,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 61: [2022-11-26 08:37:23,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 61: [2022-11-26 08:37:23,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-26 08:37:23,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 23: [2022-11-26 08:37:23,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-26 08:37:23,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-26 08:37:23,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 56: [2022-11-26 08:37:23,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-26 08:37:23,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 56: [2022-11-26 08:37:23,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 24: [2022-11-26 08:37:23,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-26 08:37:23,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-26 08:37:23,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 13: [2022-11-26 08:37:23,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 13: [2022-11-26 08:37:23,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-26 08:37:23,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 58: [2022-11-26 08:37:23,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 58: [2022-11-26 08:37:23,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-26 08:37:23,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 61: [2022-11-26 08:37:23,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-26 08:37:23,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-26 08:37:23,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 47: [2022-11-26 08:37:23,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-26 08:37:23,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-26 08:37:23,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 48: [2022-11-26 08:37:23,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-26 08:37:23,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 48: [2022-11-26 08:37:23,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-26 08:37:23,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 48: [2022-11-26 08:37:23,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-26 08:37:23,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-26 08:37:23,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 48: [2022-11-26 08:37:23,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 48: [2022-11-26 08:37:23,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 7: [2022-11-26 08:37:23,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-26 08:37:23,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 7: [2022-11-26 08:37:23,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 13: [2022-11-26 08:37:23,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-26 08:37:23,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-26 08:37:23,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 36: [2022-11-26 08:37:23,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 36: [2022-11-26 08:37:23,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 36: [2022-11-26 08:37:23,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-26 08:37:23,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-26 08:37:23,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-26 08:37:23,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-26 08:37:23,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 36: [2022-11-26 08:37:23,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 36: [2022-11-26 08:37:23,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 45: [2022-11-26 08:37:23,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 45: [2022-11-26 08:37:23,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-26 08:37:23,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 29: [2022-11-26 08:37:23,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-26 08:37:23,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 55: [2022-11-26 08:37:23,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 29: [2022-11-26 08:37:23,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 55: [2022-11-26 08:37:23,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-26 08:37:23,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 0: [2022-11-26 08:37:23,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-26 08:37:23,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-26 08:37:23,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 18: [2022-11-26 08:37:23,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 18: [2022-11-26 08:37:23,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-26 08:37:23,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 3: [2022-11-26 08:37:23,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 3: [2022-11-26 08:37:23,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-26 08:37:23,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 0: [2022-11-26 08:37:23,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-26 08:37:23,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 35: [2022-11-26 08:37:23,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-26 08:37:23,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-26 08:37:23,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 6: [2022-11-26 08:37:23,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-26 08:37:23,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 6: [2022-11-26 08:37:23,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 15: [2022-11-26 08:37:23,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 15: [2022-11-26 08:37:23,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-26 08:37:23,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 31: [2022-11-26 08:37:23,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-26 08:37:23,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-26 08:37:23,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 62: [2022-11-26 08:37:23,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-26 08:37:23,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-26 08:37:23,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 21: [2022-11-26 08:37:23,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-26 08:37:23,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-26 08:37:23,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 2: [2022-11-26 08:37:23,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 2: [2022-11-26 08:37:23,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 2: [2022-11-26 08:37:23,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 26: [2022-11-26 08:37:23,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 42: [2022-11-26 08:37:23,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 26: [2022-11-26 08:37:23,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-26 08:37:23,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 42: [2022-11-26 08:37:23,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 42: [2022-11-26 08:37:23,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 4: [2022-11-26 08:37:23,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-26 08:37:23,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-26 08:37:23,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 1: [2022-11-26 08:37:23,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-26 08:37:23,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-26 08:37:23,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 43: [2022-11-26 08:37:23,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-26 08:37:23,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 43: [2022-11-26 08:37:23,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-26 08:37:23,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-26 08:37:23,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 43: [2022-11-26 08:37:23,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-26 08:37:23,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 43: [2022-11-26 08:37:23,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 43: [2022-11-26 08:37:23,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 55: [2022-11-26 08:37:23,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 55: [2022-11-26 08:37:23,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 55: [2022-11-26 08:37:23,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 57: [2022-11-26 08:37:23,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-26 08:37:23,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 57: [2022-11-26 08:37:23,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 57: [2022-11-26 08:37:23,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-26 08:37:23,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 57: [2022-11-26 08:37:23,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 57: [2022-11-26 08:37:23,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 57: [2022-11-26 08:37:23,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 57: [2022-11-26 08:37:23,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 19: [2022-11-26 08:37:23,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-26 08:37:23,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-26 08:37:23,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-26 08:37:23,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-26 08:37:23,179] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 19: [2022-11-26 08:37:23,179] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 54: [2022-11-26 08:37:23,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 54: [2022-11-26 08:37:23,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 54: [2022-11-26 08:37:23,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 54: [2022-11-26 08:37:23,179] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-26 08:37:23,179] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-26 08:37:23,179] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-26 08:37:23,179] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 54: [2022-11-26 08:37:23,179] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 54: [2022-11-26 08:37:23,179] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 50: [2022-11-26 08:37:23,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-26 08:37:23,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-26 08:37:23,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 19: [2022-11-26 08:37:23,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 19: [2022-11-26 08:37:23,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 19: [2022-11-26 08:37:23,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-26 08:37:23,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-26 08:37:23,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 59: [2022-11-26 08:37:23,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 59: [2022-11-26 08:37:23,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-26 08:37:23,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-26 08:37:23,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-26 08:37:23,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 59: [2022-11-26 08:37:23,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 59: [2022-11-26 08:37:23,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-26 08:37:23,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-26 08:37:23,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 19: [2022-11-26 08:37:23,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 45: [2022-11-26 08:37:23,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-26 08:37:23,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-26 08:37:23,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 23: [2022-11-26 08:37:23,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-26 08:37:23,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 23: [2022-11-26 08:37:23,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 11: [2022-11-26 08:37:23,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-26 08:37:23,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 11: [2022-11-26 08:37:23,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 30: [2022-11-26 08:37:23,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 9: [2022-11-26 08:37:23,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-26 08:37:23,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 30: [2022-11-26 08:37:23,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-26 08:37:23,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-26 08:37:23,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 30: [2022-11-26 08:37:23,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 9: [2022-11-26 08:37:23,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 9: [2022-11-26 08:37:23,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-26 08:37:23,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-26 08:37:23,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 30: [2022-11-26 08:37:23,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 30: [2022-11-26 08:37:23,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-26 08:37:23,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 9: [2022-11-26 08:37:23,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 9: [2022-11-26 08:37:23,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 9: [2022-11-26 08:37:23,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-26 08:37:23,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 30: [2022-11-26 08:37:23,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 9: [2022-11-26 08:37:23,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 9: [2022-11-26 08:37:23,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 30: [2022-11-26 08:37:23,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 30: [2022-11-26 08:37:23,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 30: [2022-11-26 08:37:23,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 25: [2022-11-26 08:37:23,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-26 08:37:23,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-26 08:37:23,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 39: [2022-11-26 08:37:23,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-26 08:37:23,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-26 08:37:23,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 58: [2022-11-26 08:37:23,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 58: [2022-11-26 08:37:23,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-26 08:37:23,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 28: [2022-11-26 08:37:23,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 28: [2022-11-26 08:37:23,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-26 08:37:23,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 43: [2022-11-26 08:37:23,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-26 08:37:23,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-26 08:37:23,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 10: [2022-11-26 08:37:23,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-26 08:37:23,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-26 08:37:23,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 50: [2022-11-26 08:37:23,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-26 08:37:23,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-26 08:37:23,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 57: [2022-11-26 08:37:23,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 57: [2022-11-26 08:37:23,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-26 08:37:23,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 59: [2022-11-26 08:37:23,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-26 08:37:23,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-26 08:37:23,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 54: [2022-11-26 08:37:23,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 54: [2022-11-26 08:37:23,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-26 08:37:23,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 37: [2022-11-26 08:37:23,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-26 08:37:23,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-26 08:37:23,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 49: [2022-11-26 08:37:23,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 49: [2022-11-26 08:37:23,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 49: [2022-11-26 08:37:23,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 51: [2022-11-26 08:37:23,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-26 08:37:23,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-26 08:37:23,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 22: [2022-11-26 08:37:23,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-26 08:37:23,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-26 08:37:23,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 52: [2022-11-26 08:37:23,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-26 08:37:23,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-26 08:37:23,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 53: [2022-11-26 08:37:23,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-26 08:37:23,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-26 08:37:23,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 56: [2022-11-26 08:37:23,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-26 08:37:23,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-26 08:37:23,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 32: [2022-11-26 08:37:23,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 32: [2022-11-26 08:37:23,254] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 32: [2022-11-26 08:37:23,254] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 38: [2022-11-26 08:37:23,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 38: [2022-11-26 08:37:23,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-26 08:37:23,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 48: [2022-11-26 08:37:23,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 48: [2022-11-26 08:37:23,256] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-26 08:37:23,256] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 44: [2022-11-26 08:37:23,260] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-26 08:37:23,260] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-26 08:37:23,260] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 24: [2022-11-26 08:37:23,260] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 41: [2022-11-26 08:37:23,260] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 24: [2022-11-26 08:37:23,260] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-26 08:37:23,260] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 41: [2022-11-26 08:37:23,260] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-26 08:37:23,260] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 8: [2022-11-26 08:37:23,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-26 08:37:23,263] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-26 08:37:23,263] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 33: [2022-11-26 08:37:23,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-26 08:37:23,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-26 08:37:23,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 55: [2022-11-26 08:37:23,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-26 08:37:23,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-26 08:37:23,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 35: [2022-11-26 08:37:23,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-26 08:37:23,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-26 08:37:23,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 0: [2022-11-26 08:37:23,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-26 08:37:23,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-26 08:37:23,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 62: [2022-11-26 08:37:23,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 51: [2022-11-26 08:37:23,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 55: [2022-11-26 08:37:23,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 53: [2022-11-26 08:37:23,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 57: [2022-11-26 08:37:23,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 61: [2022-11-26 08:37:23,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 59: [2022-11-26 08:37:23,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 63: [2022-11-26 08:37:23,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 19: [2022-11-26 08:37:23,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 21: [2022-11-26 08:37:23,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 15: [2022-11-26 08:37:23,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 54: [2022-11-26 08:37:23,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 56: [2022-11-26 08:37:23,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 58: [2022-11-26 08:37:23,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 62: [2022-11-26 08:37:23,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 16: [2022-11-26 08:37:23,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 60: [2022-11-26 08:37:23,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 2: [2022-11-26 08:37:23,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 4: [2022-11-26 08:37:23,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 14: [2022-11-26 08:37:23,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 18: [2022-11-26 08:37:23,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 10: [2022-11-26 08:37:23,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 8: [2022-11-26 08:37:23,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 0: [2022-11-26 08:37:23,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 26: [2022-11-26 08:37:23,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 11: [2022-11-26 08:37:23,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 17: [2022-11-26 08:37:23,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 9: [2022-11-26 08:37:23,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 13: [2022-11-26 08:37:23,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 23: [2022-11-26 08:37:23,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 25: [2022-11-26 08:37:23,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 27: [2022-11-26 08:37:23,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 29: [2022-11-26 08:37:23,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 31: [2022-11-26 08:37:23,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 5: [2022-11-26 08:37:23,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 33: [2022-11-26 08:37:23,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 1: [2022-11-26 08:37:23,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 7: [2022-11-26 08:37:23,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 3: [2022-11-26 08:37:23,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 35: [2022-11-26 08:37:23,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 22: [2022-11-26 08:37:23,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 12: [2022-11-26 08:37:23,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 30: [2022-11-26 08:37:23,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 24: [2022-11-26 08:37:23,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 28: [2022-11-26 08:37:23,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 20: [2022-11-26 08:37:23,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 34: [2022-11-26 08:37:23,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 40: [2022-11-26 08:37:23,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 44: [2022-11-26 08:37:23,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 6: [2022-11-26 08:37:23,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 32: [2022-11-26 08:37:23,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 42: [2022-11-26 08:37:23,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 46: [2022-11-26 08:37:23,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 36: [2022-11-26 08:37:23,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 48: [2022-11-26 08:37:23,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 41: [2022-11-26 08:37:23,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 37: [2022-11-26 08:37:23,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 47: [2022-11-26 08:37:23,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 49: [2022-11-26 08:37:23,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 43: [2022-11-26 08:37:23,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 45: [2022-11-26 08:37:23,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 39: [2022-11-26 08:37:23,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 50: [2022-11-26 08:37:23,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 38: [2022-11-26 08:37:23,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 51: [2022-11-26 08:37:23,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 55: [2022-11-26 08:37:23,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 53: [2022-11-26 08:37:23,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 57: [2022-11-26 08:37:23,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 61: [2022-11-26 08:37:23,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 59: [2022-11-26 08:37:23,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 63: [2022-11-26 08:37:23,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 19: [2022-11-26 08:37:23,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 21: [2022-11-26 08:37:23,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 15: [2022-11-26 08:37:23,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 54: [2022-11-26 08:37:23,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 56: [2022-11-26 08:37:23,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 58: [2022-11-26 08:37:23,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 62: [2022-11-26 08:37:23,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 16: [2022-11-26 08:37:23,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 60: [2022-11-26 08:37:23,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 2: [2022-11-26 08:37:23,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 4: [2022-11-26 08:37:23,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 14: [2022-11-26 08:37:23,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 18: [2022-11-26 08:37:23,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 10: [2022-11-26 08:37:23,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 8: [2022-11-26 08:37:23,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 0: [2022-11-26 08:37:23,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 26: [2022-11-26 08:37:23,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 11: [2022-11-26 08:37:23,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 17: [2022-11-26 08:37:23,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 9: [2022-11-26 08:37:23,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 13: [2022-11-26 08:37:23,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 23: [2022-11-26 08:37:23,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 25: [2022-11-26 08:37:23,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 27: [2022-11-26 08:37:23,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 29: [2022-11-26 08:37:23,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 31: [2022-11-26 08:37:23,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 5: [2022-11-26 08:37:23,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 33: [2022-11-26 08:37:23,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 1: [2022-11-26 08:37:23,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 7: [2022-11-26 08:37:23,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 3: [2022-11-26 08:37:23,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 35: [2022-11-26 08:37:23,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 22: [2022-11-26 08:37:23,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 12: [2022-11-26 08:37:23,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 30: [2022-11-26 08:37:23,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 24: [2022-11-26 08:37:23,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 28: [2022-11-26 08:37:23,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 20: [2022-11-26 08:37:23,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 34: [2022-11-26 08:37:23,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 40: [2022-11-26 08:37:23,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 44: [2022-11-26 08:37:23,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 6: [2022-11-26 08:37:23,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 32: [2022-11-26 08:37:23,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 42: [2022-11-26 08:37:23,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 46: [2022-11-26 08:37:23,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 36: [2022-11-26 08:37:23,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 48: [2022-11-26 08:37:23,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 41: [2022-11-26 08:37:23,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 37: [2022-11-26 08:37:23,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 47: [2022-11-26 08:37:23,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 49: [2022-11-26 08:37:23,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-26 08:37:23,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 43: [2022-11-26 08:37:23,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 45: [2022-11-26 08:37:23,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 39: [2022-11-26 08:37:23,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 50: [2022-11-26 08:37:23,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 38: [2022-11-26 08:37:23,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 51: [2022-11-26 08:37:23,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 55: [2022-11-26 08:37:23,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 53: [2022-11-26 08:37:23,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 57: [2022-11-26 08:37:23,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 61: [2022-11-26 08:37:23,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 59: [2022-11-26 08:37:23,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 63: [2022-11-26 08:37:23,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 19: [2022-11-26 08:37:23,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 21: [2022-11-26 08:37:23,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 15: [2022-11-26 08:37:23,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 54: [2022-11-26 08:37:23,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 56: [2022-11-26 08:37:23,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 58: [2022-11-26 08:37:23,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 62: [2022-11-26 08:37:23,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 16: [2022-11-26 08:37:23,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 60: [2022-11-26 08:37:23,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 2: [2022-11-26 08:37:23,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 4: [2022-11-26 08:37:23,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 14: [2022-11-26 08:37:23,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 18: [2022-11-26 08:37:23,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 10: [2022-11-26 08:37:23,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 8: [2022-11-26 08:37:23,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 0: [2022-11-26 08:37:23,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 26: [2022-11-26 08:37:23,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 11: [2022-11-26 08:37:23,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 17: [2022-11-26 08:37:23,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 9: [2022-11-26 08:37:23,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 13: [2022-11-26 08:37:23,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 23: [2022-11-26 08:37:23,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 25: [2022-11-26 08:37:23,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 27: [2022-11-26 08:37:23,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 29: [2022-11-26 08:37:23,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 31: [2022-11-26 08:37:23,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 5: [2022-11-26 08:37:23,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 33: [2022-11-26 08:37:23,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 1: [2022-11-26 08:37:23,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 7: [2022-11-26 08:37:23,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 3: [2022-11-26 08:37:23,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 35: [2022-11-26 08:37:23,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 22: [2022-11-26 08:37:23,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 12: [2022-11-26 08:37:23,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 30: [2022-11-26 08:37:23,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 24: [2022-11-26 08:37:23,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 28: [2022-11-26 08:37:23,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 20: [2022-11-26 08:37:23,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 34: [2022-11-26 08:37:23,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 40: [2022-11-26 08:37:23,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 44: [2022-11-26 08:37:23,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 6: [2022-11-26 08:37:23,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 32: [2022-11-26 08:37:23,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 42: [2022-11-26 08:37:23,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 46: [2022-11-26 08:37:23,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 36: [2022-11-26 08:37:23,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 48: [2022-11-26 08:37:23,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 41: [2022-11-26 08:37:23,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 37: [2022-11-26 08:37:23,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 47: [2022-11-26 08:37:23,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 43: [2022-11-26 08:37:23,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 45: [2022-11-26 08:37:23,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 39: [2022-11-26 08:37:23,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 50: [2022-11-26 08:37:23,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 38: [2022-11-26 08:37:23,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 61: [2022-11-26 08:37:23,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 63: [2022-11-26 08:37:23,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 15: [2022-11-26 08:37:23,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 62: [2022-11-26 08:37:23,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 16: [2022-11-26 08:37:23,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 60: [2022-11-26 08:37:23,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 2: [2022-11-26 08:37:23,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 14: [2022-11-26 08:37:23,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 26: [2022-11-26 08:37:23,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 17: [2022-11-26 08:37:23,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 9: [2022-11-26 08:37:23,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 13: [2022-11-26 08:37:23,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 27: [2022-11-26 08:37:23,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 29: [2022-11-26 08:37:23,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 5: [2022-11-26 08:37:23,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 12: [2022-11-26 08:37:23,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 30: [2022-11-26 08:37:23,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 20: [2022-11-26 08:37:23,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 34: [2022-11-26 08:37:23,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 40: [2022-11-26 08:37:23,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 6: [2022-11-26 08:37:23,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 46: [2022-11-26 08:37:23,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 36: [2022-11-26 08:37:23,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 47: [2022-11-26 08:37:23,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 61: [2022-11-26 08:37:23,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 63: [2022-11-26 08:37:23,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 15: [2022-11-26 08:37:23,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 62: [2022-11-26 08:37:23,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 16: [2022-11-26 08:37:23,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 60: [2022-11-26 08:37:23,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-26 08:37:23,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 2: [2022-11-26 08:37:23,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 14: [2022-11-26 08:37:23,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 17: [2022-11-26 08:37:23,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 9: [2022-11-26 08:37:23,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 27: [2022-11-26 08:37:23,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 29: [2022-11-26 08:37:23,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 5: [2022-11-26 08:37:23,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-26 08:37:23,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 12: [2022-11-26 08:37:23,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 30: [2022-11-26 08:37:23,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 20: [2022-11-26 08:37:23,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 34: [2022-11-26 08:37:23,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 40: [2022-11-26 08:37:23,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 46: [2022-11-26 08:37:23,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 36: [2022-11-26 08:37:23,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 61: [2022-11-26 08:37:23,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 63: [2022-11-26 08:37:23,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 15: [2022-11-26 08:37:23,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 16: [2022-11-26 08:37:23,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 2: [2022-11-26 08:37:23,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 14: [2022-11-26 08:37:23,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 26: [2022-11-26 08:37:23,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 17: [2022-11-26 08:37:23,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 9: [2022-11-26 08:37:23,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 13: [2022-11-26 08:37:23,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 27: [2022-11-26 08:37:23,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 29: [2022-11-26 08:37:23,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 12: [2022-11-26 08:37:23,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 30: [2022-11-26 08:37:23,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 20: [2022-11-26 08:37:23,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 34: [2022-11-26 08:37:23,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 40: [2022-11-26 08:37:23,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 6: [2022-11-26 08:37:23,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 46: [2022-11-26 08:37:23,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 36: [2022-11-26 08:37:23,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 47: [2022-11-26 08:37:23,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 26: [2022-11-26 08:37:23,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 13: [2022-11-26 08:37:23,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 6: [2022-11-26 08:37:23,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 47: [2022-11-26 08:37:23,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 1: [2022-11-26 08:37:23,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-26 08:37:23,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-26 08:37:23,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 18: [2022-11-26 08:37:23,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-26 08:37:23,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 3: [2022-11-26 08:37:23,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 4: [2022-11-26 08:37:23,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 18: [2022-11-26 08:37:23,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 4: [2022-11-26 08:37:23,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-26 08:37:23,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 3: [2022-11-26 08:37:23,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-26 08:37:23,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 23: [2022-11-26 08:37:23,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 23: [2022-11-26 08:37:23,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 23: [2022-11-26 08:37:23,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 31: [2022-11-26 08:37:23,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-26 08:37:23,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-26 08:37:23,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 42: [2022-11-26 08:37:23,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 42: [2022-11-26 08:37:23,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-26 08:37:23,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 7: [2022-11-26 08:37:23,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-26 08:37:23,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-26 08:37:23,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 52: [2022-11-26 08:37:23,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-26 08:37:23,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 52: [2022-11-26 08:37:23,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 21: [2022-11-26 08:37:23,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-26 08:37:23,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 25: [2022-11-26 08:37:23,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 21: [2022-11-26 08:37:23,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 25: [2022-11-26 08:37:23,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 25: [2022-11-26 08:37:23,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 58: [2022-11-26 08:37:23,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-26 08:37:23,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-26 08:37:23,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 11: [2022-11-26 08:37:23,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-26 08:37:23,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 11: [2022-11-26 08:37:23,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 45: [2022-11-26 08:37:23,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-26 08:37:23,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-26 08:37:23,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 10: [2022-11-26 08:37:23,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-26 08:37:23,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 10: [2022-11-26 08:37:23,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 43: [2022-11-26 08:37:23,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-26 08:37:23,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-26 08:37:23,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 46: [2022-11-26 08:37:23,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-26 08:37:23,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-26 08:37:23,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 50: [2022-11-26 08:37:23,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-26 08:37:23,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-26 08:37:23,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 40: [2022-11-26 08:37:23,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-26 08:37:23,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 37: [2022-11-26 08:37:23,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 40: [2022-11-26 08:37:23,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 37: [2022-11-26 08:37:23,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-26 08:37:23,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 27: [2022-11-26 08:37:23,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 27: [2022-11-26 08:37:23,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-26 08:37:23,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 39: [2022-11-26 08:37:23,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-26 08:37:23,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-26 08:37:23,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 59: [2022-11-26 08:37:23,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 59: [2022-11-26 08:37:23,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-26 08:37:23,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 54: [2022-11-26 08:37:23,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 51: [2022-11-26 08:37:23,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-26 08:37:23,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-26 08:37:23,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 54: [2022-11-26 08:37:23,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 34: [2022-11-26 08:37:23,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 54: [2022-11-26 08:37:23,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 34: [2022-11-26 08:37:23,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 34: [2022-11-26 08:37:23,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 28: [2022-11-26 08:37:23,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-26 08:37:23,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-26 08:37:23,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 22: [2022-11-26 08:37:23,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-26 08:37:23,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 49: [2022-11-26 08:37:23,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 22: [2022-11-26 08:37:23,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 49: [2022-11-26 08:37:23,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-26 08:37:23,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 19: [2022-11-26 08:37:23,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-26 08:37:23,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-26 08:37:23,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 57: [2022-11-26 08:37:23,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 57: [2022-11-26 08:37:23,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-26 08:37:23,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 20: [2022-11-26 08:37:23,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 20: [2022-11-26 08:37:23,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-26 08:37:23,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 16: [2022-11-26 08:37:23,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-26 08:37:23,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-26 08:37:23,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 32: [2022-11-26 08:37:23,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 32: [2022-11-26 08:37:23,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-26 08:37:23,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 53: [2022-11-26 08:37:23,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-26 08:37:23,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-26 08:37:23,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 60: [2022-11-26 08:37:23,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-26 08:37:23,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-26 08:37:23,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 24: [2022-11-26 08:37:23,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-26 08:37:23,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-26 08:37:23,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 5: [2022-11-26 08:37:23,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-26 08:37:23,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-26 08:37:23,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 52: [2022-11-26 08:37:23,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-26 08:37:23,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-26 08:37:23,370] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 48: [2022-11-26 08:37:23,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 48: [2022-11-26 08:37:23,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-26 08:37:23,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 56: [2022-11-26 08:37:23,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-26 08:37:23,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 12: [2022-11-26 08:37:23,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 56: [2022-11-26 08:37:23,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 35: [2022-11-26 08:37:23,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 12: [2022-11-26 08:37:23,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-26 08:37:23,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 35: [2022-11-26 08:37:23,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 14: [2022-11-26 08:37:23,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 35: [2022-11-26 08:37:23,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 17: [2022-11-26 08:37:23,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 14: [2022-11-26 08:37:23,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 17: [2022-11-26 08:37:23,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 14: [2022-11-26 08:37:23,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 17: [2022-11-26 08:37:23,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 8: [2022-11-26 08:37:23,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-26 08:37:23,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 8: [2022-11-26 08:37:23,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 41: [2022-11-26 08:37:23,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-26 08:37:23,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-26 08:37:23,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 44: [2022-11-26 08:37:23,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-26 08:37:23,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-26 08:37:23,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 38: [2022-11-26 08:37:23,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 55: [2022-11-26 08:37:23,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-26 08:37:23,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-26 08:37:23,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 33: [2022-11-26 08:37:23,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-26 08:37:23,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-26 08:37:23,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 47: [2022-11-26 08:37:23,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-26 08:37:23,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-26 08:37:23,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 63: [2022-11-26 08:37:23,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-26 08:37:23,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-26 08:37:23,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 62: [2022-11-26 08:37:23,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-26 08:37:23,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-26 08:37:23,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 38: [2022-11-26 08:37:23,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-26 08:37:23,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 30: [2022-11-26 08:37:23,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 30: [2022-11-26 08:37:23,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-26 08:37:23,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 0: [2022-11-26 08:37:23,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-26 08:37:23,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 0: [2022-11-26 08:37:23,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 13: [2022-11-26 08:37:23,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-26 08:37:23,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-26 08:37:23,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 26: [2022-11-26 08:37:23,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 26: [2022-11-26 08:37:23,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 26: [2022-11-26 08:37:23,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 29: [2022-11-26 08:37:23,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-26 08:37:23,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 29: [2022-11-26 08:37:23,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 2: [2022-11-26 08:37:23,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-26 08:37:23,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-26 08:37:23,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 61: [2022-11-26 08:37:23,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 7: [2022-11-26 08:37:23,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 61: [2022-11-26 08:37:23,390] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 7: [2022-11-26 08:37:23,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-26 08:37:23,390] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 61: [2022-11-26 08:37:23,390] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 21: [2022-11-26 08:37:23,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-26 08:37:23,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-26 08:37:23,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 6: [2022-11-26 08:37:23,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 6: [2022-11-26 08:37:23,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 6: [2022-11-26 08:37:23,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 15: [2022-11-26 08:37:23,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 23: [2022-11-26 08:37:23,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 15: [2022-11-26 08:37:23,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-26 08:37:23,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 23: [2022-11-26 08:37:23,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-26 08:37:23,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 1: [2022-11-26 08:37:23,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-26 08:37:23,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-26 08:37:23,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 42: [2022-11-26 08:37:23,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 42: [2022-11-26 08:37:23,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-26 08:37:23,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 18: [2022-11-26 08:37:23,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-26 08:37:23,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-26 08:37:23,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 3: [2022-11-26 08:37:23,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-26 08:37:23,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-26 08:37:23,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 9: [2022-11-26 08:37:23,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 9: [2022-11-26 08:37:23,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-26 08:37:23,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 31: [2022-11-26 08:37:23,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-26 08:37:23,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-26 08:37:23,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 25: [2022-11-26 08:37:23,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 4: [2022-11-26 08:37:23,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 25: [2022-11-26 08:37:23,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-26 08:37:23,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 4: [2022-11-26 08:37:23,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 11: [2022-11-26 08:37:23,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 4: [2022-11-26 08:37:23,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 11: [2022-11-26 08:37:23,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-26 08:37:23,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 45: [2022-11-26 08:37:23,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 45: [2022-11-26 08:37:23,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 45: [2022-11-26 08:37:23,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 19: [2022-11-26 08:37:23,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-26 08:37:23,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-26 08:37:23,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 58: [2022-11-26 08:37:23,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-26 08:37:23,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-26 08:37:23,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 50: [2022-11-26 08:37:23,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-26 08:37:23,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-26 08:37:23,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 39: [2022-11-26 08:37:23,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-26 08:37:23,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-26 08:37:23,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 59: [2022-11-26 08:37:23,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-26 08:37:23,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-26 08:37:23,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 10: [2022-11-26 08:37:23,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 10: [2022-11-26 08:37:23,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 10: [2022-11-26 08:37:23,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 28: [2022-11-26 08:37:23,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 57: [2022-11-26 08:37:23,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 28: [2022-11-26 08:37:23,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-26 08:37:23,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 57: [2022-11-26 08:37:23,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-26 08:37:23,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 46: [2022-11-26 08:37:23,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-26 08:37:23,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 46: [2022-11-26 08:37:23,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 43: [2022-11-26 08:37:23,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-26 08:37:23,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-26 08:37:23,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 36: [2022-11-26 08:37:23,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-26 08:37:23,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-26 08:37:23,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 54: [2022-11-26 08:37:23,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 51: [2022-11-26 08:37:23,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-26 08:37:23,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-26 08:37:23,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 54: [2022-11-26 08:37:23,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 54: [2022-11-26 08:37:23,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 40: [2022-11-26 08:37:23,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-26 08:37:23,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-26 08:37:23,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 34: [2022-11-26 08:37:23,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 34: [2022-11-26 08:37:23,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-26 08:37:23,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 27: [2022-11-26 08:37:23,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-26 08:37:23,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-26 08:37:23,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 22: [2022-11-26 08:37:23,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-26 08:37:23,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-26 08:37:23,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 37: [2022-11-26 08:37:23,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 37: [2022-11-26 08:37:23,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 37: [2022-11-26 08:37:23,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 49: [2022-11-26 08:37:23,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-26 08:37:23,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-26 08:37:23,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 53: [2022-11-26 08:37:23,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 53: [2022-11-26 08:37:23,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-26 08:37:23,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 5: [2022-11-26 08:37:23,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 5: [2022-11-26 08:37:23,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-26 08:37:23,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 60: [2022-11-26 08:37:23,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-26 08:37:23,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-26 08:37:23,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 24: [2022-11-26 08:37:23,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 24: [2022-11-26 08:37:23,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-26 08:37:23,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 52: [2022-11-26 08:37:23,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-26 08:37:23,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-26 08:37:23,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 14: [2022-11-26 08:37:23,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 32: [2022-11-26 08:37:23,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 14: [2022-11-26 08:37:23,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-26 08:37:23,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 32: [2022-11-26 08:37:23,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 32: [2022-11-26 08:37:23,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 48: [2022-11-26 08:37:23,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 48: [2022-11-26 08:37:23,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 48: [2022-11-26 08:37:23,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 56: [2022-11-26 08:37:23,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-26 08:37:23,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 12: [2022-11-26 08:37:23,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 56: [2022-11-26 08:37:23,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 12: [2022-11-26 08:37:23,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-26 08:37:23,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 63: [2022-11-26 08:37:23,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-26 08:37:23,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 50: [2022-11-26 08:37:23,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 63: [2022-11-26 08:37:23,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 50: [2022-11-26 08:37:23,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 50: [2022-11-26 08:37:23,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 41: [2022-11-26 08:37:23,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-26 08:37:23,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-26 08:37:23,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 0: [2022-11-26 08:37:23,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 13: [2022-11-26 08:37:23,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 25: [2022-11-26 08:37:23,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 0: [2022-11-26 08:37:23,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 13: [2022-11-26 08:37:23,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 23: [2022-11-26 08:37:23,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 0: [2022-11-26 08:37:23,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 13: [2022-11-26 08:37:23,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 25: [2022-11-26 08:37:23,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 23: [2022-11-26 08:37:23,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-26 08:37:23,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 25: [2022-11-26 08:37:23,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 57: [2022-11-26 08:37:23,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 38: [2022-11-26 08:37:23,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 57: [2022-11-26 08:37:23,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 57: [2022-11-26 08:37:23,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 44: [2022-11-26 08:37:23,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-26 08:37:23,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-26 08:37:23,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 19: [2022-11-26 08:37:23,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 10: [2022-11-26 08:37:23,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 19: [2022-11-26 08:37:23,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 49: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 19: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 10: [2022-11-26 08:37:23,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 42: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 49: [2022-11-26 08:37:23,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 45: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 10: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 49: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 42: [2022-11-26 08:37:23,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 45: [2022-11-26 08:37:23,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 2: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 42: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 45: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 18: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 2: [2022-11-26 08:37:23,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 18: [2022-11-26 08:37:23,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 15: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 34: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 15: [2022-11-26 08:37:23,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 34: [2022-11-26 08:37:23,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 11: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 34: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 11: [2022-11-26 08:37:23,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 4: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 11: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 4: [2022-11-26 08:37:23,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 33: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 46: [2022-11-26 08:37:23,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 4: [2022-11-26 08:37:23,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 33: [2022-11-26 08:37:23,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 46: [2022-11-26 08:37:23,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 29: [2022-11-26 08:37:23,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 33: [2022-11-26 08:37:23,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 46: [2022-11-26 08:37:23,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 29: [2022-11-26 08:37:23,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-26 08:37:23,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 39: [2022-11-26 08:37:23,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 26: [2022-11-26 08:37:23,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 39: [2022-11-26 08:37:23,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 26: [2022-11-26 08:37:23,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 39: [2022-11-26 08:37:23,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 26: [2022-11-26 08:37:23,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 31: [2022-11-26 08:37:23,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-26 08:37:23,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-26 08:37:23,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 6: [2022-11-26 08:37:23,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 38: [2022-11-26 08:37:23,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-26 08:37:23,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 6: [2022-11-26 08:37:23,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 36: [2022-11-26 08:37:23,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 6: [2022-11-26 08:37:23,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 36: [2022-11-26 08:37:23,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-26 08:37:23,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 3: [2022-11-26 08:37:23,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 37: [2022-11-26 08:37:23,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 3: [2022-11-26 08:37:23,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 21: [2022-11-26 08:37:23,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 3: [2022-11-26 08:37:23,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 37: [2022-11-26 08:37:23,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 55: [2022-11-26 08:37:23,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 37: [2022-11-26 08:37:23,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 21: [2022-11-26 08:37:23,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 55: [2022-11-26 08:37:23,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 21: [2022-11-26 08:37:23,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 55: [2022-11-26 08:37:23,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 1: [2022-11-26 08:37:23,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 35: [2022-11-26 08:37:23,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 9: [2022-11-26 08:37:23,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 1: [2022-11-26 08:37:23,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 9: [2022-11-26 08:37:23,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 1: [2022-11-26 08:37:23,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 35: [2022-11-26 08:37:23,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 9: [2022-11-26 08:37:23,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 35: [2022-11-26 08:37:23,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 30: [2022-11-26 08:37:23,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 30: [2022-11-26 08:37:23,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 51: [2022-11-26 08:37:23,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 30: [2022-11-26 08:37:23,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 51: [2022-11-26 08:37:23,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-26 08:37:23,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 43: [2022-11-26 08:37:23,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-26 08:37:23,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-26 08:37:23,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 47: [2022-11-26 08:37:23,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 61: [2022-11-26 08:37:23,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 47: [2022-11-26 08:37:23,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-26 08:37:23,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 61: [2022-11-26 08:37:23,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-26 08:37:23,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 7: [2022-11-26 08:37:23,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 7: [2022-11-26 08:37:23,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 7: [2022-11-26 08:37:23,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 8: [2022-11-26 08:37:23,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 59: [2022-11-26 08:37:23,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 8: [2022-11-26 08:37:23,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-26 08:37:23,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 59: [2022-11-26 08:37:23,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-26 08:37:23,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 16: [2022-11-26 08:37:23,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-26 08:37:23,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-26 08:37:23,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 54: [2022-11-26 08:37:23,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 54: [2022-11-26 08:37:23,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-26 08:37:23,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 62: [2022-11-26 08:37:23,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-26 08:37:23,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-26 08:37:23,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 28: [2022-11-26 08:37:23,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-26 08:37:23,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-26 08:37:23,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 20: [2022-11-26 08:37:23,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 20: [2022-11-26 08:37:23,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-26 08:37:23,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 22: [2022-11-26 08:37:23,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-26 08:37:23,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-26 08:37:23,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 58: [2022-11-26 08:37:23,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-26 08:37:23,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-26 08:37:23,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 27: [2022-11-26 08:37:23,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 27: [2022-11-26 08:37:23,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-26 08:37:23,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 32: [2022-11-26 08:37:23,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 32: [2022-11-26 08:37:23,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-26 08:37:23,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 44: [2022-11-26 08:37:23,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-26 08:37:23,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-26 08:37:23,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 52: [2022-11-26 08:37:23,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-26 08:37:23,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-26 08:37:23,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 48: [2022-11-26 08:37:23,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-26 08:37:23,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-26 08:37:23,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 56: [2022-11-26 08:37:23,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 56: [2022-11-26 08:37:23,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-26 08:37:23,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 24: [2022-11-26 08:37:23,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 24: [2022-11-26 08:37:23,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-26 08:37:23,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 55: [2022-11-26 08:37:23,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-26 08:37:23,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-26 08:37:23,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 35: [2022-11-26 08:37:23,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 53: [2022-11-26 08:37:23,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 38: [2022-11-26 08:37:23,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 35: [2022-11-26 08:37:23,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-26 08:37:23,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 53: [2022-11-26 08:37:23,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-26 08:37:23,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 51: [2022-11-26 08:37:23,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-26 08:37:23,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-26 08:37:23,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 38: [2022-11-26 08:37:23,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-26 08:37:23,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 41: [2022-11-26 08:37:23,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 41: [2022-11-26 08:37:23,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 41: [2022-11-26 08:37:23,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 8: [2022-11-26 08:37:23,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-26 08:37:23,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-26 08:37:23,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 33: [2022-11-26 08:37:23,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-26 08:37:23,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-26 08:37:23,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 20: [2022-11-26 08:37:23,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-26 08:37:23,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-26 08:37:23,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 58: [2022-11-26 08:37:23,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-26 08:37:23,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-26 08:37:23,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 5: [2022-11-26 08:37:23,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-26 08:37:23,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-26 08:37:23,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 63: [2022-11-26 08:37:23,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-26 08:37:23,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-26 08:37:23,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 40: [2022-11-26 08:37:23,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-26 08:37:23,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-26 08:37:23,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 16: [2022-11-26 08:37:23,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 17: [2022-11-26 08:37:23,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-26 08:37:23,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 16: [2022-11-26 08:37:23,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 17: [2022-11-26 08:37:23,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 16: [2022-11-26 08:37:23,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 12: [2022-11-26 08:37:23,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 12: [2022-11-26 08:37:23,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-26 08:37:23,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 16: [2022-11-26 08:37:23,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-26 08:37:23,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-26 08:37:23,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 12: [2022-11-26 08:37:23,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-26 08:37:23,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-26 08:37:23,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 36: [2022-11-26 08:37:23,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 27: [2022-11-26 08:37:23,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-26 08:37:23,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-26 08:37:23,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 36: [2022-11-26 08:37:23,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-26 08:37:23,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 40: [2022-11-26 08:37:23,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-26 08:37:23,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-26 08:37:23,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 40: [2022-11-26 08:37:23,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-26 08:37:23,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-26 08:37:23,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 14: [2022-11-26 08:37:23,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-26 08:37:23,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-26 08:37:23,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 34: [2022-11-26 08:37:23,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 34: [2022-11-26 08:37:23,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 5: [2022-11-26 08:37:23,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 34: [2022-11-26 08:37:23,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 5: [2022-11-26 08:37:23,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-26 08:37:23,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 17: [2022-11-26 08:37:23,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-26 08:37:23,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-26 08:37:23,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 16: [2022-11-26 08:37:23,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-26 08:37:23,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-26 08:37:23,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 16: [2022-11-26 08:37:23,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-26 08:37:23,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-26 08:37:23,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 14: [2022-11-26 08:37:23,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 14: [2022-11-26 08:37:23,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-26 08:37:23,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 5: [2022-11-26 08:37:23,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-26 08:37:23,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-26 08:37:23,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 63: [2022-11-26 08:37:23,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 34: [2022-11-26 08:37:23,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 34: [2022-11-26 08:37:23,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-26 08:37:23,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 20: [2022-11-26 08:37:23,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-26 08:37:23,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 17: [2022-11-26 08:37:23,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 20: [2022-11-26 08:37:23,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 17: [2022-11-26 08:37:23,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 27: [2022-11-26 08:37:23,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 20: [2022-11-26 08:37:23,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 17: [2022-11-26 08:37:23,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 27: [2022-11-26 08:37:23,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 20: [2022-11-26 08:37:23,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 12: [2022-11-26 08:37:23,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 20: [2022-11-26 08:37:23,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-26 08:37:23,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 12: [2022-11-26 08:37:23,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 20: [2022-11-26 08:37:23,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 27: [2022-11-26 08:37:23,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 20: [2022-11-26 08:37:23,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 12: [2022-11-26 08:37:23,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 27: [2022-11-26 08:37:23,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-26 08:37:23,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-26 08:37:23,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 12: [2022-11-26 08:37:23,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-26 08:37:23,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-26 08:37:23,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 63: [2022-11-26 08:37:23,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-26 08:37:23,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 63: [2022-11-26 08:37:23,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-26 08:37:23,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-26 08:37:23,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 63: [2022-11-26 08:37:23,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-26 08:37:23,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-26 08:37:23,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 5: [2022-11-26 08:37:23,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-26 08:37:23,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-26 08:37:23,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 17: [2022-11-26 08:37:23,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 17: [2022-11-26 08:37:23,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 17: [2022-11-26 08:37:23,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 34: [2022-11-26 08:37:23,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 34: [2022-11-26 08:37:23,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-26 08:37:23,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 14: [2022-11-26 08:37:23,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 14: [2022-11-26 08:37:23,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-26 08:37:23,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 14: [2022-11-26 08:37:23,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-26 08:37:23,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-26 08:37:23,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 60: [2022-11-26 08:37:23,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 60: [2022-11-26 08:37:23,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-26 08:37:23,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 60: [2022-11-26 08:37:23,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 60: [2022-11-26 08:37:23,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-26 08:37:23,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 60: [2022-11-26 08:37:23,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 60: [2022-11-26 08:37:23,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 60: [2022-11-26 08:37:23,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 60: [2022-11-26 08:37:23,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-26 08:37:23,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-26 08:37:23,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 17: [2022-11-26 08:37:23,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-26 08:37:23,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-26 08:37:23,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 40: [2022-11-26 08:37:23,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-26 08:37:23,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step23000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-26 08:37:23,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! 0: successfully saved checkpoint at iteration 23000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5311.77 63: iteration 23010/ 24424 | consumed samples: 11781120 | consumed tokens: 24127733760 | elapsed time per iteration (s): 2.85 | learning rate: 2.152E-05 | global batch size: 512 | lm loss: 1.963757E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.483 | TFLOPs: 18.48 | 63: iteration 23020/ 24424 | consumed samples: 11786240 | consumed tokens: 24138219520 | elapsed time per iteration (s): 2.23 | learning rate: 2.149E-05 | global batch size: 512 | lm loss: 1.978846E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.193 | TFLOPs: 23.59 | 63: iteration 23030/ 24424 | consumed samples: 11791360 | consumed tokens: 24148705280 | elapsed time per iteration (s): 2.27 | learning rate: 2.147E-05 | global batch size: 512 | lm loss: 1.983200E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.936 | TFLOPs: 23.26 | 63: iteration 23040/ 24424 | consumed samples: 11796480 | consumed tokens: 24159191040 | elapsed time per iteration (s): 2.25 | learning rate: 2.145E-05 | global batch size: 512 | lm loss: 1.959303E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.524 | TFLOPs: 23.42 | 63: iteration 23050/ 24424 | consumed samples: 11801600 | consumed tokens: 24169676800 | elapsed time per iteration (s): 2.34 | learning rate: 2.143E-05 | global batch size: 512 | lm loss: 1.975393E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 218.759 | TFLOPs: 22.52 | 63: iteration 23060/ 24424 | consumed samples: 11806720 | consumed tokens: 24180162560 | elapsed time per iteration (s): 2.27 | learning rate: 2.141E-05 | global batch size: 512 | lm loss: 1.981488E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.937 | TFLOPs: 23.26 | 63: iteration 23070/ 24424 | consumed samples: 11811840 | consumed tokens: 24190648320 | elapsed time per iteration (s): 3.93 | learning rate: 2.139E-05 | global batch size: 512 | lm loss: 1.985740E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 130.314 | TFLOPs: 13.42 | 63: iteration 23080/ 24424 | consumed samples: 11816960 | consumed tokens: 24201134080 | elapsed time per iteration (s): 2.25 | learning rate: 2.137E-05 | global batch size: 512 | lm loss: 1.972748E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.084 | TFLOPs: 23.38 | 63: iteration 23090/ 24424 | consumed samples: 11822080 | consumed tokens: 24211619840 | elapsed time per iteration (s): 2.29 | learning rate: 2.135E-05 | global batch size: 512 | lm loss: 1.963695E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.931 | TFLOPs: 23.05 | 63: iteration 23100/ 24424 | consumed samples: 11827200 | consumed tokens: 24222105600 | elapsed time per iteration (s): 2.26 | learning rate: 2.133E-05 | global batch size: 512 | lm loss: 1.970056E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.897 | TFLOPs: 23.36 | 63: iteration 23110/ 24424 | consumed samples: 11832320 | consumed tokens: 24232591360 | elapsed time per iteration (s): 2.23 | learning rate: 2.131E-05 | global batch size: 512 | lm loss: 1.992321E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.303 | TFLOPs: 23.61 | 63: iteration 23120/ 24424 | consumed samples: 11837440 | consumed tokens: 24243077120 | elapsed time per iteration (s): 2.23 | learning rate: 2.129E-05 | global batch size: 512 | lm loss: 1.978156E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.528 | TFLOPs: 23.63 | 63: iteration 23130/ 24424 | consumed samples: 11842560 | consumed tokens: 24253562880 | elapsed time per iteration (s): 2.24 | learning rate: 2.127E-05 | global batch size: 512 | lm loss: 1.966526E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.384 | TFLOPs: 23.51 | 63: iteration 23140/ 24424 | consumed samples: 11847680 | consumed tokens: 24264048640 | elapsed time per iteration (s): 2.25 | learning rate: 2.125E-05 | global batch size: 512 | lm loss: 1.961495E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.793 | TFLOPs: 23.45 | 63: iteration 23150/ 24424 | consumed samples: 11852800 | consumed tokens: 24274534400 | elapsed time per iteration (s): 2.27 | learning rate: 2.123E-05 | global batch size: 512 | lm loss: 1.956131E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.043 | TFLOPs: 23.27 | 63: iteration 23160/ 24424 | consumed samples: 11857920 | consumed tokens: 24285020160 | elapsed time per iteration (s): 2.23 | learning rate: 2.121E-05 | global batch size: 512 | lm loss: 1.958671E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.046 | TFLOPs: 23.68 | 63: iteration 23170/ 24424 | consumed samples: 11863040 | consumed tokens: 24295505920 | elapsed time per iteration (s): 2.28 | learning rate: 2.119E-05 | global batch size: 512 | lm loss: 1.973967E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.656 | TFLOPs: 23.13 | 63: iteration 23180/ 24424 | consumed samples: 11868160 | consumed tokens: 24305991680 | elapsed time per iteration (s): 2.23 | learning rate: 2.117E-05 | global batch size: 512 | lm loss: 1.983965E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.230 | TFLOPs: 23.60 | 63: iteration 23190/ 24424 | consumed samples: 11873280 | consumed tokens: 24316477440 | elapsed time per iteration (s): 2.23 | learning rate: 2.116E-05 | global batch size: 512 | lm loss: 1.968025E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.422 | TFLOPs: 23.62 | 63: iteration 23200/ 24424 | consumed samples: 11878400 | consumed tokens: 24326963200 | elapsed time per iteration (s): 2.26 | learning rate: 2.114E-05 | global batch size: 512 | lm loss: 1.984479E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.749 | TFLOPs: 23.34 | 63: iteration 23210/ 24424 | consumed samples: 11883520 | consumed tokens: 24337448960 | elapsed time per iteration (s): 2.24 | learning rate: 2.112E-05 | global batch size: 512 | lm loss: 1.975559E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.906 | TFLOPs: 23.56 | 63: iteration 23220/ 24424 | consumed samples: 11888640 | consumed tokens: 24347934720 | elapsed time per iteration (s): 2.26 | learning rate: 2.110E-05 | global batch size: 512 | lm loss: 1.991330E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.076 | TFLOPs: 23.27 | 63: iteration 23230/ 24424 | consumed samples: 11893760 | consumed tokens: 24358420480 | elapsed time per iteration (s): 2.31 | learning rate: 2.108E-05 | global batch size: 512 | lm loss: 1.984363E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.992 | TFLOPs: 22.85 | 63: iteration 23240/ 24424 | consumed samples: 11898880 | consumed tokens: 24368906240 | elapsed time per iteration (s): 2.24 | learning rate: 2.106E-05 | global batch size: 512 | lm loss: 2.000327E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.074 | TFLOPs: 23.48 | 63: iteration 23250/ 24424 | consumed samples: 11904000 | consumed tokens: 24379392000 | elapsed time per iteration (s): 2.25 | learning rate: 2.105E-05 | global batch size: 512 | lm loss: 1.986659E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.154 | TFLOPs: 23.38 | 63: iteration 23260/ 24424 | consumed samples: 11909120 | consumed tokens: 24389877760 | elapsed time per iteration (s): 2.26 | learning rate: 2.103E-05 | global batch size: 512 | lm loss: 1.987580E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.493 | TFLOPs: 23.32 | 63: iteration 23270/ 24424 | consumed samples: 11914240 | consumed tokens: 24400363520 | elapsed time per iteration (s): 2.23 | learning rate: 2.101E-05 | global batch size: 512 | lm loss: 1.994224E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.593 | TFLOPs: 23.64 | 63: iteration 23280/ 24424 | consumed samples: 11919360 | consumed tokens: 24410849280 | elapsed time per iteration (s): 2.25 | learning rate: 2.099E-05 | global batch size: 512 | lm loss: 1.970597E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.779 | TFLOPs: 23.45 | 63: iteration 23290/ 24424 | consumed samples: 11924480 | consumed tokens: 24421335040 | elapsed time per iteration (s): 2.26 | learning rate: 2.098E-05 | global batch size: 512 | lm loss: 1.984788E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.908 | TFLOPs: 23.36 | 63: iteration 23300/ 24424 | consumed samples: 11929600 | consumed tokens: 24431820800 | elapsed time per iteration (s): 2.28 | learning rate: 2.096E-05 | global batch size: 512 | lm loss: 1.975328E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.558 | TFLOPs: 23.12 | 63: iteration 23310/ 24424 | consumed samples: 11934720 | consumed tokens: 24442306560 | elapsed time per iteration (s): 2.26 | learning rate: 2.094E-05 | global batch size: 512 | lm loss: 1.970122E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.246 | TFLOPs: 23.29 | 63: iteration 23320/ 24424 | consumed samples: 11939840 | consumed tokens: 24452792320 | elapsed time per iteration (s): 2.23 | learning rate: 2.093E-05 | global batch size: 512 | lm loss: 1.984249E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.102 | TFLOPs: 23.69 | 63: iteration 23330/ 24424 | consumed samples: 11944960 | consumed tokens: 24463278080 | elapsed time per iteration (s): 2.23 | learning rate: 2.091E-05 | global batch size: 512 | lm loss: 1.960690E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.272 | TFLOPs: 23.60 | 63: iteration 23340/ 24424 | consumed samples: 11950080 | consumed tokens: 24473763840 | elapsed time per iteration (s): 2.25 | learning rate: 2.089E-05 | global batch size: 512 | lm loss: 1.958031E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.462 | TFLOPs: 23.42 | 63: iteration 23350/ 24424 | consumed samples: 11955200 | consumed tokens: 24484249600 | elapsed time per iteration (s): 2.30 | learning rate: 2.088E-05 | global batch size: 512 | lm loss: 1.978046E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.989 | TFLOPs: 22.96 | 63: iteration 23360/ 24424 | consumed samples: 11960320 | consumed tokens: 24494735360 | elapsed time per iteration (s): 2.42 | learning rate: 2.086E-05 | global batch size: 512 | lm loss: 1.953701E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 211.765 | TFLOPs: 21.80 | 63: iteration 23370/ 24424 | consumed samples: 11965440 | consumed tokens: 24505221120 | elapsed time per iteration (s): 2.30 | learning rate: 2.084E-05 | global batch size: 512 | lm loss: 1.976731E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 222.950 | TFLOPs: 22.95 | 63: iteration 23380/ 24424 | consumed samples: 11970560 | consumed tokens: 24515706880 | elapsed time per iteration (s): 3.56 | learning rate: 2.083E-05 | global batch size: 512 | lm loss: 1.957521E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 144.012 | TFLOPs: 14.83 | 63: iteration 23390/ 24424 | consumed samples: 11975680 | consumed tokens: 24526192640 | elapsed time per iteration (s): 2.25 | learning rate: 2.081E-05 | global batch size: 512 | lm loss: 1.982607E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.784 | TFLOPs: 23.45 | 63: iteration 23400/ 24424 | consumed samples: 11980800 | consumed tokens: 24536678400 | elapsed time per iteration (s): 2.24 | learning rate: 2.080E-05 | global batch size: 512 | lm loss: 1.971441E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.325 | TFLOPs: 23.50 | 63: iteration 23410/ 24424 | consumed samples: 11985920 | consumed tokens: 24547164160 | elapsed time per iteration (s): 2.29 | learning rate: 2.078E-05 | global batch size: 512 | lm loss: 1.986524E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.468 | TFLOPs: 23.00 | 63: iteration 23420/ 24424 | consumed samples: 11991040 | consumed tokens: 24557649920 | elapsed time per iteration (s): 2.26 | learning rate: 2.077E-05 | global batch size: 512 | lm loss: 1.976444E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.807 | TFLOPs: 23.35 | 63: iteration 23430/ 24424 | consumed samples: 11996160 | consumed tokens: 24568135680 | elapsed time per iteration (s): 2.25 | learning rate: 2.075E-05 | global batch size: 512 | lm loss: 1.976819E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.919 | TFLOPs: 23.46 | 63: iteration 23440/ 24424 | consumed samples: 12001280 | consumed tokens: 24578621440 | elapsed time per iteration (s): 2.23 | learning rate: 2.074E-05 | global batch size: 512 | lm loss: 1.984949E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.887 | TFLOPs: 23.67 | 63: iteration 23450/ 24424 | consumed samples: 12006400 | consumed tokens: 24589107200 | elapsed time per iteration (s): 2.24 | learning rate: 2.072E-05 | global batch size: 512 | lm loss: 1.969520E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.601 | TFLOPs: 23.53 | 63: iteration 23460/ 24424 | consumed samples: 12011520 | consumed tokens: 24599592960 | elapsed time per iteration (s): 2.23 | learning rate: 2.071E-05 | global batch size: 512 | lm loss: 2.002122E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.862 | TFLOPs: 23.66 | 63: iteration 23470/ 24424 | consumed samples: 12016640 | consumed tokens: 24610078720 | elapsed time per iteration (s): 2.26 | learning rate: 2.069E-05 | global batch size: 512 | lm loss: 1.980006E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.171 | TFLOPs: 23.28 | 63: iteration 23480/ 24424 | consumed samples: 12021760 | consumed tokens: 24620564480 | elapsed time per iteration (s): 2.24 | learning rate: 2.068E-05 | global batch size: 512 | lm loss: 1.967442E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.125 | TFLOPs: 23.48 | 63: iteration 23490/ 24424 | consumed samples: 12026880 | consumed tokens: 24631050240 | elapsed time per iteration (s): 2.26 | learning rate: 2.066E-05 | global batch size: 512 | lm loss: 1.996934E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.983 | TFLOPs: 23.37 | 63: iteration 23500/ 24424 | consumed samples: 12032000 | consumed tokens: 24641536000 | elapsed time per iteration (s): 2.23 | learning rate: 2.065E-05 | global batch size: 512 | lm loss: 1.983427E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.008 | TFLOPs: 23.68 | 63: iteration 23510/ 24424 | consumed samples: 12037120 | consumed tokens: 24652021760 | elapsed time per iteration (s): 2.28 | learning rate: 2.063E-05 | global batch size: 512 | lm loss: 1.975865E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.954 | TFLOPs: 23.16 | 63: iteration 23520/ 24424 | consumed samples: 12042240 | consumed tokens: 24662507520 | elapsed time per iteration (s): 2.26 | learning rate: 2.062E-05 | global batch size: 512 | lm loss: 1.976427E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.695 | TFLOPs: 23.34 | 63: iteration 23530/ 24424 | consumed samples: 12047360 | consumed tokens: 24672993280 | elapsed time per iteration (s): 2.25 | learning rate: 2.061E-05 | global batch size: 512 | lm loss: 1.980670E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.038 | TFLOPs: 23.48 | 63: iteration 23540/ 24424 | consumed samples: 12052480 | consumed tokens: 24683479040 | elapsed time per iteration (s): 2.34 | learning rate: 2.059E-05 | global batch size: 512 | lm loss: 1.974585E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 218.805 | TFLOPs: 22.52 | 63: iteration 23550/ 24424 | consumed samples: 12057600 | consumed tokens: 24693964800 | elapsed time per iteration (s): 2.28 | learning rate: 2.058E-05 | global batch size: 512 | lm loss: 1.956989E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.052 | TFLOPs: 23.17 | 63: iteration 23560/ 24424 | consumed samples: 12062720 | consumed tokens: 24704450560 | elapsed time per iteration (s): 2.24 | learning rate: 2.057E-05 | global batch size: 512 | lm loss: 1.979220E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.841 | TFLOPs: 23.56 | 63: iteration 23570/ 24424 | consumed samples: 12067840 | consumed tokens: 24714936320 | elapsed time per iteration (s): 2.23 | learning rate: 2.055E-05 | global batch size: 512 | lm loss: 1.966831E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.295 | TFLOPs: 23.60 | 63: iteration 23580/ 24424 | consumed samples: 12072960 | consumed tokens: 24725422080 | elapsed time per iteration (s): 2.24 | learning rate: 2.054E-05 | global batch size: 512 | lm loss: 1.978752E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.657 | TFLOPs: 23.54 | 63: iteration 23590/ 24424 | consumed samples: 12078080 | consumed tokens: 24735907840 | elapsed time per iteration (s): 2.23 | learning rate: 2.053E-05 | global batch size: 512 | lm loss: 1.984603E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.703 | TFLOPs: 23.65 | 63: iteration 23600/ 24424 | consumed samples: 12083200 | consumed tokens: 24746393600 | elapsed time per iteration (s): 2.25 | learning rate: 2.052E-05 | global batch size: 512 | lm loss: 1.978761E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.585 | TFLOPs: 23.43 | 63: iteration 23610/ 24424 | consumed samples: 12088320 | consumed tokens: 24756879360 | elapsed time per iteration (s): 2.26 | learning rate: 2.050E-05 | global batch size: 512 | lm loss: 1.983083E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.610 | TFLOPs: 23.33 | 63: iteration 23620/ 24424 | consumed samples: 12093440 | consumed tokens: 24767365120 | elapsed time per iteration (s): 2.23 | learning rate: 2.049E-05 | global batch size: 512 | lm loss: 1.949734E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.306 | TFLOPs: 23.61 | 63: iteration 23630/ 24424 | consumed samples: 12098560 | consumed tokens: 24777850880 | elapsed time per iteration (s): 2.29 | learning rate: 2.048E-05 | global batch size: 512 | lm loss: 1.973593E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.025 | TFLOPs: 23.06 | 63: iteration 23640/ 24424 | consumed samples: 12103680 | consumed tokens: 24788336640 | elapsed time per iteration (s): 2.29 | learning rate: 2.047E-05 | global batch size: 512 | lm loss: 1.959103E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.117 | TFLOPs: 22.97 | 63: iteration 23650/ 24424 | consumed samples: 12108800 | consumed tokens: 24798822400 | elapsed time per iteration (s): 2.23 | learning rate: 2.046E-05 | global batch size: 512 | lm loss: 1.958253E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.137 | TFLOPs: 23.59 | 63: iteration 23660/ 24424 | consumed samples: 12113920 | consumed tokens: 24809308160 | elapsed time per iteration (s): 2.29 | learning rate: 2.044E-05 | global batch size: 512 | lm loss: 1.985115E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.659 | TFLOPs: 23.02 | 63: iteration 23670/ 24424 | consumed samples: 12119040 | consumed tokens: 24819793920 | elapsed time per iteration (s): 2.31 | learning rate: 2.043E-05 | global batch size: 512 | lm loss: 1.985402E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.768 | TFLOPs: 22.83 | 63: iteration 23680/ 24424 | consumed samples: 12124160 | consumed tokens: 24830279680 | elapsed time per iteration (s): 2.27 | learning rate: 2.042E-05 | global batch size: 512 | lm loss: 1.980646E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.034 | TFLOPs: 23.27 | 63: iteration 23690/ 24424 | consumed samples: 12129280 | consumed tokens: 24840765440 | elapsed time per iteration (s): 2.25 | learning rate: 2.041E-05 | global batch size: 512 | lm loss: 1.953419E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.109 | TFLOPs: 23.38 | 63: iteration 23700/ 24424 | consumed samples: 12134400 | consumed tokens: 24851251200 | elapsed time per iteration (s): 2.25 | learning rate: 2.040E-05 | global batch size: 512 | lm loss: 1.976796E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.349 | TFLOPs: 23.40 | 63: iteration 23710/ 24424 | consumed samples: 12139520 | consumed tokens: 24861736960 | elapsed time per iteration (s): 2.32 | learning rate: 2.039E-05 | global batch size: 512 | lm loss: 1.974233E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 220.401 | TFLOPs: 22.69 | 63: iteration 23720/ 24424 | consumed samples: 12144640 | consumed tokens: 24872222720 | elapsed time per iteration (s): 2.29 | learning rate: 2.038E-05 | global batch size: 512 | lm loss: 1.966824E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.228 | TFLOPs: 22.98 | 63: iteration 23730/ 24424 | consumed samples: 12149760 | consumed tokens: 24882708480 | elapsed time per iteration (s): 2.49 | learning rate: 2.037E-05 | global batch size: 512 | lm loss: 1.970314E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 205.648 | TFLOPs: 21.17 | 63: iteration 23740/ 24424 | consumed samples: 12154880 | consumed tokens: 24893194240 | elapsed time per iteration (s): 2.31 | learning rate: 2.036E-05 | global batch size: 512 | lm loss: 1.987291E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.907 | TFLOPs: 22.84 | 63: iteration 23750/ 24424 | consumed samples: 12160000 | consumed tokens: 24903680000 | elapsed time per iteration (s): 2.25 | learning rate: 2.035E-05 | global batch size: 512 | lm loss: 1.970032E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.478 | TFLOPs: 23.42 | 63: iteration 23760/ 24424 | consumed samples: 12165120 | consumed tokens: 24914165760 | elapsed time per iteration (s): 2.23 | learning rate: 2.034E-05 | global batch size: 512 | lm loss: 1.964457E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.887 | TFLOPs: 23.67 | 63: iteration 23770/ 24424 | consumed samples: 12170240 | consumed tokens: 24924651520 | elapsed time per iteration (s): 2.23 | learning rate: 2.033E-05 | global batch size: 512 | lm loss: 1.978876E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.056 | TFLOPs: 23.68 | 63: iteration 23780/ 24424 | consumed samples: 12175360 | consumed tokens: 24935137280 | elapsed time per iteration (s): 2.25 | learning rate: 2.032E-05 | global batch size: 512 | lm loss: 1.983485E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.365 | TFLOPs: 23.41 | 63: iteration 23790/ 24424 | consumed samples: 12180480 | consumed tokens: 24945623040 | elapsed time per iteration (s): 2.25 | learning rate: 2.031E-05 | global batch size: 512 | lm loss: 1.965192E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.072 | TFLOPs: 23.38 | 63: iteration 23800/ 24424 | consumed samples: 12185600 | consumed tokens: 24956108800 | elapsed time per iteration (s): 2.25 | learning rate: 2.030E-05 | global batch size: 512 | lm loss: 1.976315E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.133 | TFLOPs: 23.38 | 63: iteration 23810/ 24424 | consumed samples: 12190720 | consumed tokens: 24966594560 | elapsed time per iteration (s): 2.25 | learning rate: 2.029E-05 | global batch size: 512 | lm loss: 1.965884E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.249 | TFLOPs: 23.39 | 63: iteration 23820/ 24424 | consumed samples: 12195840 | consumed tokens: 24977080320 | elapsed time per iteration (s): 2.26 | learning rate: 2.028E-05 | global batch size: 512 | lm loss: 1.966348E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.021 | TFLOPs: 23.37 | 63: iteration 23830/ 24424 | consumed samples: 12200960 | consumed tokens: 24987566080 | elapsed time per iteration (s): 2.40 | learning rate: 2.027E-05 | global batch size: 512 | lm loss: 1.971701E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 213.711 | TFLOPs: 22.00 | 63: iteration 23840/ 24424 | consumed samples: 12206080 | consumed tokens: 24998051840 | elapsed time per iteration (s): 2.24 | learning rate: 2.026E-05 | global batch size: 512 | lm loss: 1.987284E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.459 | TFLOPs: 23.52 | 63: iteration 23850/ 24424 | consumed samples: 12211200 | consumed tokens: 25008537600 | elapsed time per iteration (s): 3.78 | learning rate: 2.025E-05 | global batch size: 512 | lm loss: 1.971436E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 135.492 | TFLOPs: 13.95 | 63: iteration 23860/ 24424 | consumed samples: 12216320 | consumed tokens: 25019023360 | elapsed time per iteration (s): 2.28 | learning rate: 2.024E-05 | global batch size: 512 | lm loss: 1.973341E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.073 | TFLOPs: 23.07 | 63: iteration 23870/ 24424 | consumed samples: 12221440 | consumed tokens: 25029509120 | elapsed time per iteration (s): 2.24 | learning rate: 2.023E-05 | global batch size: 512 | lm loss: 1.947033E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.735 | TFLOPs: 23.55 | 63: iteration 23880/ 24424 | consumed samples: 12226560 | consumed tokens: 25039994880 | elapsed time per iteration (s): 2.24 | learning rate: 2.023E-05 | global batch size: 512 | lm loss: 1.951915E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.267 | TFLOPs: 23.50 | 63: iteration 23890/ 24424 | consumed samples: 12231680 | consumed tokens: 25050480640 | elapsed time per iteration (s): 2.28 | learning rate: 2.022E-05 | global batch size: 512 | lm loss: 1.976850E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.515 | TFLOPs: 23.11 | 63: iteration 23900/ 24424 | consumed samples: 12236800 | consumed tokens: 25060966400 | elapsed time per iteration (s): 2.28 | learning rate: 2.021E-05 | global batch size: 512 | lm loss: 1.983852E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.044 | TFLOPs: 23.17 | 63: iteration 23910/ 24424 | consumed samples: 12241920 | consumed tokens: 25071452160 | elapsed time per iteration (s): 2.27 | learning rate: 2.020E-05 | global batch size: 512 | lm loss: 1.957214E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.873 | TFLOPs: 23.25 | 63: iteration 23920/ 24424 | consumed samples: 12247040 | consumed tokens: 25081937920 | elapsed time per iteration (s): 2.25 | learning rate: 2.019E-05 | global batch size: 512 | lm loss: 1.981234E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.206 | TFLOPs: 23.39 | 63: iteration 23930/ 24424 | consumed samples: 12252160 | consumed tokens: 25092423680 | elapsed time per iteration (s): 2.24 | learning rate: 2.019E-05 | global batch size: 512 | lm loss: 1.966849E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.679 | TFLOPs: 23.54 | 63: iteration 23940/ 24424 | consumed samples: 12257280 | consumed tokens: 25102909440 | elapsed time per iteration (s): 2.28 | learning rate: 2.018E-05 | global batch size: 512 | lm loss: 1.981041E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.000 | TFLOPs: 23.16 | 63: iteration 23950/ 24424 | consumed samples: 12262400 | consumed tokens: 25113395200 | elapsed time per iteration (s): 2.24 | learning rate: 2.017E-05 | global batch size: 512 | lm loss: 1.969203E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.125 | TFLOPs: 23.48 | 63: iteration 23960/ 24424 | consumed samples: 12267520 | consumed tokens: 25123880960 | elapsed time per iteration (s): 2.23 | learning rate: 2.016E-05 | global batch size: 512 | lm loss: 1.978481E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.863 | TFLOPs: 23.66 | 63: iteration 23970/ 24424 | consumed samples: 12272640 | consumed tokens: 25134366720 | elapsed time per iteration (s): 2.23 | learning rate: 2.016E-05 | global batch size: 512 | lm loss: 1.978794E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.455 | TFLOPs: 23.62 | 63: iteration 23980/ 24424 | consumed samples: 12277760 | consumed tokens: 25144852480 | elapsed time per iteration (s): 2.23 | learning rate: 2.015E-05 | global batch size: 512 | lm loss: 1.969643E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.486 | TFLOPs: 23.62 | 63: iteration 23990/ 24424 | consumed samples: 12282880 | consumed tokens: 25155338240 | elapsed time per iteration (s): 2.42 | learning rate: 2.014E-05 | global batch size: 512 | lm loss: 1.985154E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 211.341 | TFLOPs: 21.76 | 0: [2022-11-26 09:15:52,233] [INFO] [logging.py:68:log_dist] [Rank 0] step=24000, skipped=0, lr=[2.0137020411077448e-05, 2.0137020411077448e-05, 2.0137020411077448e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] 63: iteration 24000/ 24424 | consumed samples: 12288000 | consumed tokens: 25165824000 | elapsed time per iteration (s): 2.24 | learning rate: 2.014E-05 | global batch size: 512 | lm loss: 1.972959E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.507 | TFLOPs: 23.52 | 0: steps: 24000 loss: 1.9872 iter time (s): 2.310 samples/sec: 221.632 63: ------------------------------------------------------------------------------------------- 63: valid loss at iteration 24000 | lm loss value: 1.917802E+00 | lm loss PPL: 6.805986E+00 | 63: ------------------------------------------------------------------------------------------- 0: saving checkpoint at iteration 24000 to checkpoints_3b9 0: [2022-11-26 09:15:52,960] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step24000 is begin to save! 0: [2022-11-26 09:15:52,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_01-model_00-model_states.pt... 32: [2022-11-26 09:15:52,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_21-model_00-model_states.pt... 32: [2022-11-26 09:15:53,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_21-model_00-model_states.pt. 32: [2022-11-26 09:15:53,241] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_22-model_00-model_states.pt... 0: [2022-11-26 09:15:53,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_01-model_00-model_states.pt. 0: [2022-11-26 09:15:53,400] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_03-model_00-model_states.pt... 32: [2022-11-26 09:15:53,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_22-model_00-model_states.pt. 32: [2022-11-26 09:15:53,474] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_23-model_00-model_states.pt... 0: [2022-11-26 09:15:53,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_03-model_00-model_states.pt. 0: [2022-11-26 09:15:53,638] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_04-model_00-model_states.pt... 32: [2022-11-26 09:15:53,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_23-model_00-model_states.pt. 32: [2022-11-26 09:15:53,719] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_24-model_00-model_states.pt... 0: [2022-11-26 09:15:53,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_04-model_00-model_states.pt. 0: [2022-11-26 09:15:53,878] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_05-model_00-model_states.pt... 32: [2022-11-26 09:15:53,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_24-model_00-model_states.pt. 32: [2022-11-26 09:15:53,956] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_25-model_00-model_states.pt... 0: [2022-11-26 09:15:54,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_05-model_00-model_states.pt. 0: [2022-11-26 09:15:54,119] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_06-model_00-model_states.pt... 32: [2022-11-26 09:15:54,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_25-model_00-model_states.pt. 32: [2022-11-26 09:15:54,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_26-model_00-model_states.pt... 0: [2022-11-26 09:15:54,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_06-model_00-model_states.pt. 0: [2022-11-26 09:15:54,353] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_07-model_00-model_states.pt... 32: [2022-11-26 09:15:54,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_26-model_00-model_states.pt. 32: [2022-11-26 09:15:54,433] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_27-model_00-model_states.pt... 0: [2022-11-26 09:15:54,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_07-model_00-model_states.pt. 0: [2022-11-26 09:15:54,590] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_08-model_00-model_states.pt... 32: [2022-11-26 09:15:54,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_27-model_00-model_states.pt. 32: [2022-11-26 09:15:54,672] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_28-model_00-model_states.pt... 0: [2022-11-26 09:15:54,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_08-model_00-model_states.pt. 0: [2022-11-26 09:15:54,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_09-model_00-model_states.pt... 32: [2022-11-26 09:15:54,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_28-model_00-model_states.pt. 32: [2022-11-26 09:15:54,911] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_29-model_00-model_states.pt... 0: [2022-11-26 09:15:55,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_09-model_00-model_states.pt. 0: [2022-11-26 09:15:55,063] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_10-model_00-model_states.pt... 32: [2022-11-26 09:15:55,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_29-model_00-model_states.pt. 32: [2022-11-26 09:15:55,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_30-model_00-model_states.pt... 0: [2022-11-26 09:15:55,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_10-model_00-model_states.pt. 0: [2022-11-26 09:15:55,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_11-model_00-model_states.pt... 32: [2022-11-26 09:15:55,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_30-model_00-model_states.pt. 32: [2022-11-26 09:15:55,376] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_31-model_00-model_states.pt... 0: [2022-11-26 09:15:55,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_11-model_00-model_states.pt. 0: [2022-11-26 09:15:55,542] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_12-model_00-model_states.pt... 32: [2022-11-26 09:15:55,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_31-model_00-model_states.pt. 32: [2022-11-26 09:15:55,611] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_32-model_00-model_states.pt... 0: [2022-11-26 09:15:55,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_12-model_00-model_states.pt. 0: [2022-11-26 09:15:55,781] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_13-model_00-model_states.pt... 32: [2022-11-26 09:15:55,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_32-model_00-model_states.pt. 32: [2022-11-26 09:15:55,840] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_33-model_00-model_states.pt... 0: [2022-11-26 09:15:56,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_13-model_00-model_states.pt. 0: [2022-11-26 09:15:56,017] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_14-model_00-model_states.pt... 32: [2022-11-26 09:15:56,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_33-model_00-model_states.pt. 32: [2022-11-26 09:15:56,071] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_34-model_00-model_states.pt... 0: [2022-11-26 09:15:56,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_14-model_00-model_states.pt. 0: [2022-11-26 09:15:56,254] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_15-model_00-model_states.pt... 32: [2022-11-26 09:15:56,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_34-model_00-model_states.pt. 32: [2022-11-26 09:15:56,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_35-model_00-model_states.pt... 0: [2022-11-26 09:15:56,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_15-model_00-model_states.pt. 0: [2022-11-26 09:15:56,483] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_16-model_00-model_states.pt... 32: [2022-11-26 09:15:56,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_35-model_00-model_states.pt. 32: [2022-11-26 09:15:56,535] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_36-model_00-model_states.pt... 0: [2022-11-26 09:15:56,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_16-model_00-model_states.pt. 0: [2022-11-26 09:15:56,718] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_17-model_00-model_states.pt... 32: [2022-11-26 09:15:56,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_36-model_00-model_states.pt. 32: [2022-11-26 09:15:56,764] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_37-model_00-model_states.pt... 0: [2022-11-26 09:15:56,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_17-model_00-model_states.pt. 0: [2022-11-26 09:15:56,953] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_18-model_00-model_states.pt... 32: [2022-11-26 09:15:56,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_37-model_00-model_states.pt. 32: [2022-11-26 09:15:56,990] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_38-model_00-model_states.pt... 0: [2022-11-26 09:15:57,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_18-model_00-model_states.pt. 0: [2022-11-26 09:15:57,181] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_19-model_00-model_states.pt... 32: [2022-11-26 09:15:57,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_38-model_00-model_states.pt. 32: [2022-11-26 09:15:57,218] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_40-model_00-model_states.pt... 32: [2022-11-26 09:15:57,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_40-model_00-model_states.pt. 32: [2022-11-26 09:15:57,220] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/mp_rank_01_model_states.pt... 32: [2022-11-26 09:15:57,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/mp_rank_01_model_states.pt. 0: [2022-11-26 09:15:57,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_19-model_00-model_states.pt. 0: [2022-11-26 09:15:57,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/layer_20-model_00-model_states.pt... 0: [2022-11-26 09:15:57,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/layer_20-model_00-model_states.pt. 0: [2022-11-26 09:15:57,638] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step24000/mp_rank_00_model_states.pt 0: [2022-11-26 09:15:57,638] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/mp_rank_00_model_states.pt... 0: [2022-11-26 09:15:57,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/mp_rank_00_model_states.pt. 32: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 48: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 48: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 50: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 50: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 50: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 50: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 50: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 51: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 51: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 51: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 53: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 53: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 53: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 53: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 61: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 61: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 61: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 59: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 59: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 59: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 52: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 52: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 54: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 54: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 54: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 56: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 56: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 56: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 56: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 58: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 62: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 62: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 62: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 62: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 60: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 60: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 60: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 60: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 33: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 33: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 35: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 35: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 35: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 34: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 34: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 34: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 40: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 40: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 40: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 44: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 42: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 42: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 42: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 42: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 46: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 46: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 46: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 46: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 46: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 36: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 36: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 36: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 37: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 47: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 47: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 47: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 43: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 43: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 43: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 43: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 45: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 45: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 39: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 39: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 39: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 39: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 50: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 50: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 38: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 38: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 38: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 51: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 55: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 55: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 55: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 55: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 53: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 57: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 57: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 61: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 59: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 59: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 63: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 63: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 19: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 52: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 52: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 54: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 56: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 56: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 58: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 58: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 58: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 62: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 16: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 16: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 60: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 60: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 4: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 4: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 4: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 14: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 14: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 10: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 10: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 11: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 11: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 31: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 31: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 31: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 33: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 33: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 33: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 35: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 35: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 30: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 30: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 30: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 30: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 30: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 20: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 20: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 20: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 20: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 34: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 34: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 40: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 44: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 32: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 32: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 32: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 42: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 46: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 36: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 36: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 48: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 48: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 48: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 41: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 41: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 41: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 37: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 47: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 49: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 49: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 49: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 43: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 45: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 45: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 45: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 39: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 39: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 38: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 55: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 57: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 63: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 19: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 19: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 21: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 21: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 15: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 15: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 15: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 54: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 16: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 2: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 4: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 14: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 14: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 18: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 18: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 10: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 10: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 10: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 8: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 8: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 8: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 8: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 26: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 11: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 11: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 17: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 17: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 17: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 17: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 9: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 9: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 9: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 9: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 13: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 13: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 23: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 23: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 23: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 23: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 25: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 25: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 27: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 27: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 29: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 29: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 31: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 5: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 5: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 5: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 5: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 1: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 7: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 7: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 22: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 22: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 22: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 12: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 12: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 12: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 12: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 12: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 30: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 24: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 24: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 28: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 20: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 20: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 40: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 44: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 6: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 6: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 6: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 32: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 42: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 46: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 48: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 41: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 37: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 47: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 49: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 49: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 45: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 38: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 55: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 57: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 63: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 19: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 21: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 15: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 54: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 16: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 2: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 4: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 4: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 14: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 18: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 18: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 10: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 8: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 26: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 11: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 17: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 17: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 17: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 9: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 13: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 23: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 23: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 23: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 25: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 25: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 25: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 27: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 27: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 29: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 29: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 31: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 5: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 5: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 5: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 1: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 1: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 1: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 1: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 7: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 22: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 12: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 30: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 24: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 28: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 20: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 40: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 44: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 6: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 6: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 32: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 48: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 37: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 47: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 45: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 38: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 57: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 63: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 19: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 21: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 15: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 15: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 54: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 16: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 2: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 4: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 14: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 18: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 10: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 8: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 26: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 26: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 26: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 11: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 17: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 9: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 9: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 13: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 23: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 25: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 27: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 29: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 31: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 1: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 7: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 7: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 22: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 12: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 30: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 24: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 28: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 20: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 44: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 6: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 32: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 48: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 37: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 45: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 63: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 19: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 19: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 21: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 21: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 15: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 54: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 16: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 2: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 4: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 14: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 18: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 10: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 8: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 26: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 11: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 9: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 13: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 27: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 29: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 31: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 1: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 7: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 22: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 22: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 12: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 24: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 24: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 28: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 44: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 6: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 32: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 37: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 37: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 63: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 19: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 21: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 16: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 2: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 2: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 14: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 18: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 8: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 26: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 13: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 27: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 29: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 31: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 1: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 7: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 22: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 24: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 28: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 28: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 6: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 21: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 16: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 2: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 2: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 13: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 27: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 29: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 7: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 24: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 28: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 13: [2022-11-26 09:15:57,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:15:57,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 15: [2022-11-26 09:15:57,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 15: [2022-11-26 09:15:57,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-26 09:15:57,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 52: [2022-11-26 09:15:57,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 52: [2022-11-26 09:15:57,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-26 09:15:57,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 5: [2022-11-26 09:15:57,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-26 09:15:57,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 5: [2022-11-26 09:15:57,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 17: [2022-11-26 09:15:57,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 17: [2022-11-26 09:15:57,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 17: [2022-11-26 09:15:57,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 1: [2022-11-26 09:15:57,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-26 09:15:57,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 1: [2022-11-26 09:15:57,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 10: [2022-11-26 09:15:57,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-26 09:15:57,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-26 09:15:57,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 56: [2022-11-26 09:15:57,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-26 09:15:57,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-26 09:15:57,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 23: [2022-11-26 09:15:57,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:15:57,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 23: [2022-11-26 09:15:57,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 22: [2022-11-26 09:15:57,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 22: [2022-11-26 09:15:57,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 22: [2022-11-26 09:15:57,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 32: [2022-11-26 09:15:57,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 38: [2022-11-26 09:15:57,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 32: [2022-11-26 09:15:57,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 38: [2022-11-26 09:15:57,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 44: [2022-11-26 09:15:57,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 32: [2022-11-26 09:15:57,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 38: [2022-11-26 09:15:57,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 44: [2022-11-26 09:15:57,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-26 09:15:57,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 26: [2022-11-26 09:15:57,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 26: [2022-11-26 09:15:57,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-26 09:15:57,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 40: [2022-11-26 09:15:57,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-26 09:15:57,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 25: [2022-11-26 09:15:57,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 40: [2022-11-26 09:15:57,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 6: [2022-11-26 09:15:57,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 25: [2022-11-26 09:15:57,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 6: [2022-11-26 09:15:57,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 25: [2022-11-26 09:15:57,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 6: [2022-11-26 09:15:57,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 34: [2022-11-26 09:15:57,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 18: [2022-11-26 09:15:57,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 18: [2022-11-26 09:15:57,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 18: [2022-11-26 09:15:57,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 34: [2022-11-26 09:15:57,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 34: [2022-11-26 09:15:57,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 62: [2022-11-26 09:15:57,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-26 09:15:57,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-26 09:15:57,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 52: [2022-11-26 09:15:57,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-26 09:15:57,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-26 09:15:57,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 57: [2022-11-26 09:15:57,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 57: [2022-11-26 09:15:57,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 57: [2022-11-26 09:15:57,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 49: [2022-11-26 09:15:57,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-26 09:15:57,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 50: [2022-11-26 09:15:57,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 49: [2022-11-26 09:15:57,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 50: [2022-11-26 09:15:57,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-26 09:15:57,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 8: [2022-11-26 09:15:57,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-26 09:15:57,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 8: [2022-11-26 09:15:57,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 13: [2022-11-26 09:15:57,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 35: [2022-11-26 09:15:57,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 13: [2022-11-26 09:15:57,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 35: [2022-11-26 09:15:57,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 13: [2022-11-26 09:15:57,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 35: [2022-11-26 09:15:57,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 61: [2022-11-26 09:15:57,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 61: [2022-11-26 09:15:57,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 16: [2022-11-26 09:15:57,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 61: [2022-11-26 09:15:57,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 16: [2022-11-26 09:15:57,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-26 09:15:57,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 45: [2022-11-26 09:15:57,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-26 09:15:57,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 45: [2022-11-26 09:15:57,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 36: [2022-11-26 09:15:57,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-26 09:15:57,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-26 09:15:57,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 43: [2022-11-26 09:15:57,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-26 09:15:57,925] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-26 09:15:57,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 48: [2022-11-26 09:15:57,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-26 09:15:57,925] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 0: [2022-11-26 09:15:57,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 48: [2022-11-26 09:15:57,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 0: [2022-11-26 09:15:57,925] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 0: [2022-11-26 09:15:57,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 7: [2022-11-26 09:15:57,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 7: [2022-11-26 09:15:57,925] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 7: [2022-11-26 09:15:57,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 33: [2022-11-26 09:15:57,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 33: [2022-11-26 09:15:57,925] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-26 09:15:57,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 42: [2022-11-26 09:15:57,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 42: [2022-11-26 09:15:57,926] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-26 09:15:57,926] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 1: [2022-11-26 09:15:57,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 1: [2022-11-26 09:15:57,927] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 1: [2022-11-26 09:15:57,927] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 30: [2022-11-26 09:15:57,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 30: [2022-11-26 09:15:57,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 30: [2022-11-26 09:15:57,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 11: [2022-11-26 09:15:57,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 11: [2022-11-26 09:15:57,929] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 11: [2022-11-26 09:15:57,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 55: [2022-11-26 09:15:57,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 55: [2022-11-26 09:15:57,929] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-26 09:15:57,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 51: [2022-11-26 09:15:57,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-26 09:15:57,930] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-26 09:15:57,930] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 24: [2022-11-26 09:15:57,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-26 09:15:57,930] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-26 09:15:57,930] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 30: [2022-11-26 09:15:57,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 30: [2022-11-26 09:15:57,931] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 30: [2022-11-26 09:15:57,931] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 32: [2022-11-26 09:15:57,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 32: [2022-11-26 09:15:57,931] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 37: [2022-11-26 09:15:57,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 32: [2022-11-26 09:15:57,931] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 37: [2022-11-26 09:15:57,931] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-26 09:15:57,931] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 20: [2022-11-26 09:15:57,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:15:57,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:15:57,931] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-26 09:15:57,931] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 24: [2022-11-26 09:15:57,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:15:57,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 20: [2022-11-26 09:15:57,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 17: [2022-11-26 09:15:57,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 24: [2022-11-26 09:15:57,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 24: [2022-11-26 09:15:57,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 17: [2022-11-26 09:15:57,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-26 09:15:57,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 53: [2022-11-26 09:15:57,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 53: [2022-11-26 09:15:57,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 53: [2022-11-26 09:15:57,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 6: [2022-11-26 09:15:57,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 6: [2022-11-26 09:15:57,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 60: [2022-11-26 09:15:57,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 6: [2022-11-26 09:15:57,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 60: [2022-11-26 09:15:57,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 60: [2022-11-26 09:15:57,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 56: [2022-11-26 09:15:57,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-26 09:15:57,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-26 09:15:57,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 18: [2022-11-26 09:15:57,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 18: [2022-11-26 09:15:57,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 52: [2022-11-26 09:15:57,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 18: [2022-11-26 09:15:57,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 52: [2022-11-26 09:15:57,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-26 09:15:57,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 25: [2022-11-26 09:15:57,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-26 09:15:57,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-26 09:15:57,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 44: [2022-11-26 09:15:57,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 44: [2022-11-26 09:15:57,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-26 09:15:57,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 11: [2022-11-26 09:15:57,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 11: [2022-11-26 09:15:57,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 11: [2022-11-26 09:15:57,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 54: [2022-11-26 09:15:57,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 54: [2022-11-26 09:15:57,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-26 09:15:57,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 10: [2022-11-26 09:15:57,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-26 09:15:57,935] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-26 09:15:57,935] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 46: [2022-11-26 09:15:57,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 46: [2022-11-26 09:15:57,935] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 38: [2022-11-26 09:15:57,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 46: [2022-11-26 09:15:57,935] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 5: [2022-11-26 09:15:57,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 43: [2022-11-26 09:15:57,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 5: [2022-11-26 09:15:57,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 43: [2022-11-26 09:15:57,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 5: [2022-11-26 09:15:57,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 43: [2022-11-26 09:15:57,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 45: [2022-11-26 09:15:57,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-26 09:15:57,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-26 09:15:57,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 42: [2022-11-26 09:15:57,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 42: [2022-11-26 09:15:57,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 35: [2022-11-26 09:15:57,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 33: [2022-11-26 09:15:57,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 49: [2022-11-26 09:15:57,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 33: [2022-11-26 09:15:57,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 35: [2022-11-26 09:15:57,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 42: [2022-11-26 09:15:57,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 49: [2022-11-26 09:15:57,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 33: [2022-11-26 09:15:57,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 35: [2022-11-26 09:15:57,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 49: [2022-11-26 09:15:57,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 21: [2022-11-26 09:15:57,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 21: [2022-11-26 09:15:57,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-26 09:15:57,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 62: [2022-11-26 09:15:57,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 62: [2022-11-26 09:15:57,937] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 62: [2022-11-26 09:15:57,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 58: [2022-11-26 09:15:57,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-26 09:15:57,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 58: [2022-11-26 09:15:57,937] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-26 09:15:57,937] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 58: [2022-11-26 09:15:57,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 58: [2022-11-26 09:15:57,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 13: [2022-11-26 09:15:57,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 13: [2022-11-26 09:15:57,937] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 13: [2022-11-26 09:15:57,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 48: [2022-11-26 09:15:57,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-26 09:15:57,938] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 56: [2022-11-26 09:15:57,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 48: [2022-11-26 09:15:57,938] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 38: [2022-11-26 09:15:57,935] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-26 09:15:57,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 56: [2022-11-26 09:15:57,938] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 7: [2022-11-26 09:15:57,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 56: [2022-11-26 09:15:57,938] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 7: [2022-11-26 09:15:57,938] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-26 09:15:57,938] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 6: [2022-11-26 09:15:57,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 6: [2022-11-26 09:15:57,939] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 6: [2022-11-26 09:15:57,939] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 26: [2022-11-26 09:15:57,939] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-26 09:15:57,939] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-26 09:15:57,939] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 23: [2022-11-26 09:15:57,939] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:15:57,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 23: [2022-11-26 09:15:57,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 8: [2022-11-26 09:15:57,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 8: [2022-11-26 09:15:57,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 8: [2022-11-26 09:15:57,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 23: [2022-11-26 09:15:57,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:15:57,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 50: [2022-11-26 09:15:57,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 23: [2022-11-26 09:15:57,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 47: [2022-11-26 09:15:57,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 50: [2022-11-26 09:15:57,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 47: [2022-11-26 09:15:57,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 47: [2022-11-26 09:15:57,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 50: [2022-11-26 09:15:57,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 47: [2022-11-26 09:15:57,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 47: [2022-11-26 09:15:57,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 47: [2022-11-26 09:15:57,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 17: [2022-11-26 09:15:57,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-26 09:15:57,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-26 09:15:57,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 35: [2022-11-26 09:15:57,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 35: [2022-11-26 09:15:57,941] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 35: [2022-11-26 09:15:57,941] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 57: [2022-11-26 09:15:57,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 57: [2022-11-26 09:15:57,941] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-26 09:15:57,941] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 0: [2022-11-26 09:15:57,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 28: [2022-11-26 09:15:57,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 0: [2022-11-26 09:15:57,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 28: [2022-11-26 09:15:57,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 55: [2022-11-26 09:15:57,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 0: [2022-11-26 09:15:57,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 28: [2022-11-26 09:15:57,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 39: [2022-11-26 09:15:57,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 55: [2022-11-26 09:15:57,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-26 09:15:57,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 39: [2022-11-26 09:15:57,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-26 09:15:57,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 22: [2022-11-26 09:15:57,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-26 09:15:57,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-26 09:15:57,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 53: [2022-11-26 09:15:57,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 53: [2022-11-26 09:15:57,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-26 09:15:57,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 37: [2022-11-26 09:15:57,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 43: [2022-11-26 09:15:57,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 37: [2022-11-26 09:15:57,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 37: [2022-11-26 09:15:57,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 43: [2022-11-26 09:15:57,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 43: [2022-11-26 09:15:57,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 19: [2022-11-26 09:15:57,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 9: [2022-11-26 09:15:57,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 19: [2022-11-26 09:15:57,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 9: [2022-11-26 09:15:57,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 19: [2022-11-26 09:15:57,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 9: [2022-11-26 09:15:57,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 34: [2022-11-26 09:15:57,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 34: [2022-11-26 09:15:57,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 40: [2022-11-26 09:15:57,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 40: [2022-11-26 09:15:57,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 40: [2022-11-26 09:15:57,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 34: [2022-11-26 09:15:57,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 28: [2022-11-26 09:15:57,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 28: [2022-11-26 09:15:57,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 28: [2022-11-26 09:15:57,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 9: [2022-11-26 09:15:57,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 9: [2022-11-26 09:15:57,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-26 09:15:57,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 51: [2022-11-26 09:15:57,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 38: [2022-11-26 09:15:57,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 51: [2022-11-26 09:15:57,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-26 09:15:57,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 5: [2022-11-26 09:15:57,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-26 09:15:57,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-26 09:15:57,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 60: [2022-11-26 09:15:57,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-26 09:15:57,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-26 09:15:57,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 8: [2022-11-26 09:15:57,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 1: [2022-11-26 09:15:57,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 8: [2022-11-26 09:15:57,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 1: [2022-11-26 09:15:57,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 8: [2022-11-26 09:15:57,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 1: [2022-11-26 09:15:57,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 20: [2022-11-26 09:15:57,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:15:57,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 20: [2022-11-26 09:15:57,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 24: [2022-11-26 09:15:57,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 24: [2022-11-26 09:15:57,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 24: [2022-11-26 09:15:57,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 38: [2022-11-26 09:15:57,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-26 09:15:57,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 21: [2022-11-26 09:15:57,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-26 09:15:57,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 21: [2022-11-26 09:15:57,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 55: [2022-11-26 09:15:57,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 55: [2022-11-26 09:15:57,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-26 09:15:57,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 39: [2022-11-26 09:15:57,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-26 09:15:57,948] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 39: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 58: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 36: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 48: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 36: [2022-11-26 09:15:57,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 49: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 58: [2022-11-26 09:15:57,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 18: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 48: [2022-11-26 09:15:57,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 49: [2022-11-26 09:15:57,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 58: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 18: [2022-11-26 09:15:57,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 36: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 48: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 49: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 18: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 2: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 10: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 42: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 2: [2022-11-26 09:15:57,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 10: [2022-11-26 09:15:57,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 2: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 10: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 2: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 42: [2022-11-26 09:15:57,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 2: [2022-11-26 09:15:57,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 42: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 2: [2022-11-26 09:15:57,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 32: [2022-11-26 09:15:57,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 50: [2022-11-26 09:15:57,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 32: [2022-11-26 09:15:57,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 50: [2022-11-26 09:15:57,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 32: [2022-11-26 09:15:57,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 50: [2022-11-26 09:15:57,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 11: [2022-11-26 09:15:57,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 11: [2022-11-26 09:15:57,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 11: [2022-11-26 09:15:57,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 13: [2022-11-26 09:15:57,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 13: [2022-11-26 09:15:57,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-26 09:15:57,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 24: [2022-11-26 09:15:57,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 24: [2022-11-26 09:15:57,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 30: [2022-11-26 09:15:57,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 24: [2022-11-26 09:15:57,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 30: [2022-11-26 09:15:57,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 30: [2022-11-26 09:15:57,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 26: [2022-11-26 09:15:57,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 26: [2022-11-26 09:15:57,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 26: [2022-11-26 09:15:57,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 41: [2022-11-26 09:15:57,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-26 09:15:57,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 41: [2022-11-26 09:15:57,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-26 09:15:57,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-26 09:15:57,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 41: [2022-11-26 09:15:57,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-26 09:15:57,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 41: [2022-11-26 09:15:57,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 41: [2022-11-26 09:15:57,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 22: [2022-11-26 09:15:57,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-26 09:15:57,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-26 09:15:57,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 46: [2022-11-26 09:15:57,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 46: [2022-11-26 09:15:57,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 46: [2022-11-26 09:15:57,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 46: [2022-11-26 09:15:57,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 46: [2022-11-26 09:15:57,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 46: [2022-11-26 09:15:57,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 33: [2022-11-26 09:15:57,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 33: [2022-11-26 09:15:57,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-26 09:15:57,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 62: [2022-11-26 09:15:57,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 62: [2022-11-26 09:15:57,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 62: [2022-11-26 09:15:57,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 54: [2022-11-26 09:15:57,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 62: [2022-11-26 09:15:57,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 37: [2022-11-26 09:15:57,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 62: [2022-11-26 09:15:57,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 37: [2022-11-26 09:15:57,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 62: [2022-11-26 09:15:57,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 37: [2022-11-26 09:15:57,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 51: [2022-11-26 09:15:57,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 51: [2022-11-26 09:15:57,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 51: [2022-11-26 09:15:57,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 36: [2022-11-26 09:15:57,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 36: [2022-11-26 09:15:57,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 28: [2022-11-26 09:15:57,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 36: [2022-11-26 09:15:57,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 28: [2022-11-26 09:15:57,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-26 09:15:57,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 27: [2022-11-26 09:15:57,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-26 09:15:57,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-26 09:15:57,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 27: [2022-11-26 09:15:57,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-26 09:15:57,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-26 09:15:57,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-26 09:15:57,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 27: [2022-11-26 09:15:57,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 27: [2022-11-26 09:15:57,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 54: [2022-11-26 09:15:57,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-26 09:15:57,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 54: [2022-11-26 09:15:57,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 40: [2022-11-26 09:15:57,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 45: [2022-11-26 09:15:57,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 40: [2022-11-26 09:15:57,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 45: [2022-11-26 09:15:57,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 54: [2022-11-26 09:15:57,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 40: [2022-11-26 09:15:57,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 45: [2022-11-26 09:15:57,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 52: [2022-11-26 09:15:57,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 54: [2022-11-26 09:15:57,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 52: [2022-11-26 09:15:57,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 52: [2022-11-26 09:15:57,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 63: [2022-11-26 09:15:57,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-26 09:15:57,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-26 09:15:57,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 63: [2022-11-26 09:15:57,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-26 09:15:57,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-26 09:15:57,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 63: [2022-11-26 09:15:57,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 63: [2022-11-26 09:15:57,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 63: [2022-11-26 09:15:57,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 16: [2022-11-26 09:15:57,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-26 09:15:57,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-26 09:15:57,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 19: [2022-11-26 09:15:57,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-26 09:15:57,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-26 09:15:57,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 47: [2022-11-26 09:15:57,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 47: [2022-11-26 09:15:57,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 47: [2022-11-26 09:15:57,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 57: [2022-11-26 09:15:57,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 2: [2022-11-26 09:15:57,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 53: [2022-11-26 09:15:57,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-26 09:15:57,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 2: [2022-11-26 09:15:57,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 53: [2022-11-26 09:15:57,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 2: [2022-11-26 09:15:57,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 57: [2022-11-26 09:15:57,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 10: [2022-11-26 09:15:57,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 57: [2022-11-26 09:15:57,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 60: [2022-11-26 09:15:57,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 10: [2022-11-26 09:15:57,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 25: [2022-11-26 09:15:57,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 60: [2022-11-26 09:15:57,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 10: [2022-11-26 09:15:57,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 25: [2022-11-26 09:15:57,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 60: [2022-11-26 09:15:57,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 25: [2022-11-26 09:15:57,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 30: [2022-11-26 09:15:57,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 30: [2022-11-26 09:15:57,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 23: [2022-11-26 09:15:57,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 30: [2022-11-26 09:15:57,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 23: [2022-11-26 09:15:57,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 45: [2022-11-26 09:15:57,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 23: [2022-11-26 09:15:57,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 45: [2022-11-26 09:15:57,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-26 09:15:57,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 7: [2022-11-26 09:15:57,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-26 09:15:57,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 0: [2022-11-26 09:15:57,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 7: [2022-11-26 09:15:57,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 0: [2022-11-26 09:15:57,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-26 09:15:57,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 39: [2022-11-26 09:15:57,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 39: [2022-11-26 09:15:57,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 39: [2022-11-26 09:15:57,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 5: [2022-11-26 09:15:57,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-26 09:15:57,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 5: [2022-11-26 09:15:57,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 0: [2022-11-26 09:15:57,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-26 09:15:57,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 4: [2022-11-26 09:15:57,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 4: [2022-11-26 09:15:57,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 4: [2022-11-26 09:15:57,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 4: [2022-11-26 09:15:57,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 4: [2022-11-26 09:15:57,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 4: [2022-11-26 09:15:57,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 4: [2022-11-26 09:15:57,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 1: [2022-11-26 09:15:57,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 4: [2022-11-26 09:15:57,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 1: [2022-11-26 09:15:57,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 4: [2022-11-26 09:15:57,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 1: [2022-11-26 09:15:57,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 34: [2022-11-26 09:15:57,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 34: [2022-11-26 09:15:57,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 34: [2022-11-26 09:15:57,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 44: [2022-11-26 09:15:57,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-26 09:15:57,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-26 09:15:57,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 56: [2022-11-26 09:15:57,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 56: [2022-11-26 09:15:57,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 17: [2022-11-26 09:15:57,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 56: [2022-11-26 09:15:57,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 17: [2022-11-26 09:15:57,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 17: [2022-11-26 09:15:57,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 21: [2022-11-26 09:15:57,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-26 09:15:57,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-26 09:15:57,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 31: [2022-11-26 09:15:57,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 31: [2022-11-26 09:15:57,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 31: [2022-11-26 09:15:57,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 31: [2022-11-26 09:15:57,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-26 09:15:57,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-26 09:15:57,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 31: [2022-11-26 09:15:57,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 31: [2022-11-26 09:15:57,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 31: [2022-11-26 09:15:57,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 6: [2022-11-26 09:15:57,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 6: [2022-11-26 09:15:57,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 6: [2022-11-26 09:15:57,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 15: [2022-11-26 09:15:57,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-26 09:15:57,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-26 09:15:57,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 35: [2022-11-26 09:15:57,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 35: [2022-11-26 09:15:57,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-26 09:15:57,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 25: [2022-11-26 09:15:57,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-26 09:15:57,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 32: [2022-11-26 09:15:57,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 25: [2022-11-26 09:15:57,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 32: [2022-11-26 09:15:57,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 32: [2022-11-26 09:15:57,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 7: [2022-11-26 09:15:57,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 7: [2022-11-26 09:15:57,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 7: [2022-11-26 09:15:57,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 12: [2022-11-26 09:15:57,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-26 09:15:57,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 12: [2022-11-26 09:15:57,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 12: [2022-11-26 09:15:57,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-26 09:15:57,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 12: [2022-11-26 09:15:57,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 12: [2022-11-26 09:15:57,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 12: [2022-11-26 09:15:57,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 12: [2022-11-26 09:15:57,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 9: [2022-11-26 09:15:57,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 9: [2022-11-26 09:15:57,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 9: [2022-11-26 09:15:57,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 12: [2022-11-26 09:15:57,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-26 09:15:57,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-26 09:15:57,970] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 3: [2022-11-26 09:15:57,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 3: [2022-11-26 09:15:57,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 3: [2022-11-26 09:15:57,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-26 09:15:57,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 3: [2022-11-26 09:15:57,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 3: [2022-11-26 09:15:57,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 3: [2022-11-26 09:15:57,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 3: [2022-11-26 09:15:57,971] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 3: [2022-11-26 09:15:57,971] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 3: [2022-11-26 09:15:57,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 3: [2022-11-26 09:15:57,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 3: [2022-11-26 09:15:57,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 19: [2022-11-26 09:15:57,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 19: [2022-11-26 09:15:57,971] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 19: [2022-11-26 09:15:57,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 18: [2022-11-26 09:15:57,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 18: [2022-11-26 09:15:57,973] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-26 09:15:57,973] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 13: [2022-11-26 09:15:57,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-26 09:15:57,976] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-26 09:15:57,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 42: [2022-11-26 09:15:57,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 14: [2022-11-26 09:15:57,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 14: [2022-11-26 09:15:57,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-26 09:15:57,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 42: [2022-11-26 09:15:57,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-26 09:15:57,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 14: [2022-11-26 09:15:57,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-26 09:15:57,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-26 09:15:57,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-26 09:15:57,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 14: [2022-11-26 09:15:57,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 14: [2022-11-26 09:15:57,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 61: [2022-11-26 09:15:57,978] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 61: [2022-11-26 09:15:57,978] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 61: [2022-11-26 09:15:57,978] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 46: [2022-11-26 09:15:57,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-26 09:15:57,979] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 46: [2022-11-26 09:15:57,979] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 27: [2022-11-26 09:15:57,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 27: [2022-11-26 09:15:57,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-26 09:15:57,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 22: [2022-11-26 09:15:57,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-26 09:15:57,981] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-26 09:15:57,981] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 40: [2022-11-26 09:15:57,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-26 09:15:57,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-26 09:15:57,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 34: [2022-11-26 09:15:57,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 34: [2022-11-26 09:15:57,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 34: [2022-11-26 09:15:57,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 43: [2022-11-26 09:15:57,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-26 09:15:57,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-26 09:15:57,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 50: [2022-11-26 09:15:57,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 50: [2022-11-26 09:15:57,986] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 50: [2022-11-26 09:15:57,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 61: [2022-11-26 09:15:57,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-26 09:15:57,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 61: [2022-11-26 09:15:57,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 29: [2022-11-26 09:15:57,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 29: [2022-11-26 09:15:57,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-26 09:15:57,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 29: [2022-11-26 09:15:57,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 29: [2022-11-26 09:15:57,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-26 09:15:57,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-26 09:15:57,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-26 09:15:57,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 29: [2022-11-26 09:15:57,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 29: [2022-11-26 09:15:57,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 29: [2022-11-26 09:15:57,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 29: [2022-11-26 09:15:57,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 14: [2022-11-26 09:15:57,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 14: [2022-11-26 09:15:57,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 14: [2022-11-26 09:15:57,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 11: [2022-11-26 09:15:57,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 11: [2022-11-26 09:15:57,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 11: [2022-11-26 09:15:57,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 38: [2022-11-26 09:15:57,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-26 09:15:57,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-26 09:15:57,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 59: [2022-11-26 09:15:57,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-26 09:15:57,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-26 09:15:57,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 44: [2022-11-26 09:15:57,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-26 09:15:57,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-26 09:15:57,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 8: [2022-11-26 09:15:58,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 8: [2022-11-26 09:15:58,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 8: [2022-11-26 09:15:58,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 54: [2022-11-26 09:15:58,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 37: [2022-11-26 09:15:58,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 37: [2022-11-26 09:15:58,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 37: [2022-11-26 09:15:58,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 54: [2022-11-26 09:15:58,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-26 09:15:58,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 20: [2022-11-26 09:15:58,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:15:58,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 20: [2022-11-26 09:15:58,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 61: [2022-11-26 09:15:58,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-26 09:15:58,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-26 09:15:58,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 49: [2022-11-26 09:15:58,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-26 09:15:58,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-26 09:15:58,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 16: [2022-11-26 09:15:58,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-26 09:15:58,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-26 09:15:58,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 16: [2022-11-26 09:15:58,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-26 09:15:58,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 16: [2022-11-26 09:15:58,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 47: [2022-11-26 09:15:58,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-26 09:15:58,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 47: [2022-11-26 09:15:58,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 59: [2022-11-26 09:15:58,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 59: [2022-11-26 09:15:58,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 59: [2022-11-26 09:15:58,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 57: [2022-11-26 09:15:58,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 57: [2022-11-26 09:15:58,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 57: [2022-11-26 09:15:58,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 26: [2022-11-26 09:15:58,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-26 09:15:58,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-26 09:15:58,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 63: [2022-11-26 09:15:58,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-26 09:15:58,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-26 09:15:58,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 59: [2022-11-26 09:15:58,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-26 09:15:58,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-26 09:15:58,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 48: [2022-11-26 09:15:58,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 48: [2022-11-26 09:15:58,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-26 09:15:58,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 58: [2022-11-26 09:15:58,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-26 09:15:58,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-26 09:15:58,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 36: [2022-11-26 09:15:58,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-26 09:15:58,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 15: [2022-11-26 09:15:58,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 36: [2022-11-26 09:15:58,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 15: [2022-11-26 09:15:58,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-26 09:15:58,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 15: [2022-11-26 09:15:58,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-26 09:15:58,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-26 09:15:58,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 52: [2022-11-26 09:15:58,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 52: [2022-11-26 09:15:58,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 52: [2022-11-26 09:15:58,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 0: [2022-11-26 09:15:58,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 0: [2022-11-26 09:15:58,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 0: [2022-11-26 09:15:58,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 60: [2022-11-26 09:15:58,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 60: [2022-11-26 09:15:58,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 60: [2022-11-26 09:15:58,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 33: [2022-11-26 09:15:58,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-26 09:15:58,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 55: [2022-11-26 09:15:58,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 33: [2022-11-26 09:15:58,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 55: [2022-11-26 09:15:58,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 55: [2022-11-26 09:15:58,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 17: [2022-11-26 09:15:58,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-26 09:15:58,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-26 09:15:58,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 24: [2022-11-26 09:15:58,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 24: [2022-11-26 09:15:58,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 24: [2022-11-26 09:15:58,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 10: [2022-11-26 09:15:58,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 10: [2022-11-26 09:15:58,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 10: [2022-11-26 09:15:58,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 56: [2022-11-26 09:15:58,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 56: [2022-11-26 09:15:58,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 41: [2022-11-26 09:15:58,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 56: [2022-11-26 09:15:58,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 41: [2022-11-26 09:15:58,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 41: [2022-11-26 09:15:58,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 1: [2022-11-26 09:15:58,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 1: [2022-11-26 09:15:58,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 1: [2022-11-26 09:15:58,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 51: [2022-11-26 09:15:58,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 51: [2022-11-26 09:15:58,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 51: [2022-11-26 09:15:58,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 15: [2022-11-26 09:15:58,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-26 09:15:58,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 15: [2022-11-26 09:15:58,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 21: [2022-11-26 09:15:58,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-26 09:15:58,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-26 09:15:58,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 25: [2022-11-26 09:15:58,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 25: [2022-11-26 09:15:58,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 25: [2022-11-26 09:15:58,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 2: [2022-11-26 09:15:58,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 32: [2022-11-26 09:15:58,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 2: [2022-11-26 09:15:58,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 32: [2022-11-26 09:15:58,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 2: [2022-11-26 09:15:58,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 32: [2022-11-26 09:15:58,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 45: [2022-11-26 09:15:58,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-26 09:15:58,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-26 09:15:58,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 35: [2022-11-26 09:15:58,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 35: [2022-11-26 09:15:58,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-26 09:15:58,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 23: [2022-11-26 09:15:58,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:15:58,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-26 09:15:58,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 59: [2022-11-26 09:15:58,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-26 09:15:58,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-26 09:15:58,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 62: [2022-11-26 09:15:58,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 62: [2022-11-26 09:15:58,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 62: [2022-11-26 09:15:58,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 29: [2022-11-26 09:15:58,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 3: [2022-11-26 09:15:58,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 29: [2022-11-26 09:15:58,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 3: [2022-11-26 09:15:58,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 3: [2022-11-26 09:15:58,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 29: [2022-11-26 09:15:58,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 12: [2022-11-26 09:15:58,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 12: [2022-11-26 09:15:58,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 12: [2022-11-26 09:15:58,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 31: [2022-11-26 09:15:58,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-26 09:15:58,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-26 09:15:58,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 5: [2022-11-26 09:15:58,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 5: [2022-11-26 09:15:58,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 5: [2022-11-26 09:15:58,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 6: [2022-11-26 09:15:58,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 6: [2022-11-26 09:15:58,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 6: [2022-11-26 09:15:58,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 53: [2022-11-26 09:15:58,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-26 09:15:58,083] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-26 09:15:58,083] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 4: [2022-11-26 09:15:58,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 39: [2022-11-26 09:15:58,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-26 09:15:58,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 9: [2022-11-26 09:15:58,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 39: [2022-11-26 09:15:58,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 9: [2022-11-26 09:15:58,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-26 09:15:58,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 30: [2022-11-26 09:15:58,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 4: [2022-11-26 09:15:58,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-26 09:15:58,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 30: [2022-11-26 09:15:58,085] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 30: [2022-11-26 09:15:58,085] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 28: [2022-11-26 09:15:58,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 28: [2022-11-26 09:15:58,085] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 28: [2022-11-26 09:15:58,085] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 42: [2022-11-26 09:15:58,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 42: [2022-11-26 09:15:58,085] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 42: [2022-11-26 09:15:58,085] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 19: [2022-11-26 09:15:58,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 19: [2022-11-26 09:15:58,086] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 19: [2022-11-26 09:15:58,086] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 61: [2022-11-26 09:15:58,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 61: [2022-11-26 09:15:58,086] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 61: [2022-11-26 09:15:58,086] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 40: [2022-11-26 09:15:58,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-26 09:15:58,087] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-26 09:15:58,087] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 8: [2022-11-26 09:15:58,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-26 09:15:58,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-26 09:15:58,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 46: [2022-11-26 09:15:58,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 46: [2022-11-26 09:15:58,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 46: [2022-11-26 09:15:58,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 11: [2022-11-26 09:15:58,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 11: [2022-11-26 09:15:58,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 11: [2022-11-26 09:15:58,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 18: [2022-11-26 09:15:58,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-26 09:15:58,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-26 09:15:58,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 43: [2022-11-26 09:15:58,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 50: [2022-11-26 09:15:58,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-26 09:15:58,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 43: [2022-11-26 09:15:58,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 50: [2022-11-26 09:15:58,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 43: [2022-11-26 09:15:58,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 14: [2022-11-26 09:15:58,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 14: [2022-11-26 09:15:58,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 14: [2022-11-26 09:15:58,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 16: [2022-11-26 09:15:58,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-26 09:15:58,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 7: [2022-11-26 09:15:58,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 16: [2022-11-26 09:15:58,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 7: [2022-11-26 09:15:58,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 7: [2022-11-26 09:15:58,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 22: [2022-11-26 09:15:58,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 22: [2022-11-26 09:15:58,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 22: [2022-11-26 09:15:58,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 34: [2022-11-26 09:15:58,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 34: [2022-11-26 09:15:58,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-26 09:15:58,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 20: [2022-11-26 09:15:58,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:15:58,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-26 09:15:58,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 44: [2022-11-26 09:15:58,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 44: [2022-11-26 09:15:58,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 44: [2022-11-26 09:15:58,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 13: [2022-11-26 09:15:58,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 13: [2022-11-26 09:15:58,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 13: [2022-11-26 09:15:58,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 37: [2022-11-26 09:15:58,098] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 37: [2022-11-26 09:15:58,099] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-26 09:15:58,099] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 27: [2022-11-26 09:15:58,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-26 09:15:58,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-26 09:15:58,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 57: [2022-11-26 09:15:58,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 57: [2022-11-26 09:15:58,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-26 09:15:58,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 52: [2022-11-26 09:15:58,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 52: [2022-11-26 09:15:58,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 49: [2022-11-26 09:15:58,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 52: [2022-11-26 09:15:58,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 49: [2022-11-26 09:15:58,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-26 09:15:58,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 0: [2022-11-26 09:15:58,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-26 09:15:58,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-26 09:15:58,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 63: [2022-11-26 09:15:58,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-26 09:15:58,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-26 09:15:58,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 38: [2022-11-26 09:15:58,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 38: [2022-11-26 09:15:58,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 38: [2022-11-26 09:15:58,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 54: [2022-11-26 09:15:58,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 54: [2022-11-26 09:15:58,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-26 09:15:58,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 55: [2022-11-26 09:15:58,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-26 09:15:58,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-26 09:15:58,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 47: [2022-11-26 09:15:58,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 47: [2022-11-26 09:15:58,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 47: [2022-11-26 09:15:58,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 33: [2022-11-26 09:15:58,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 36: [2022-11-26 09:15:58,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 33: [2022-11-26 09:15:58,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-26 09:15:58,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 48: [2022-11-26 09:15:58,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-26 09:15:58,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-26 09:15:58,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 26: [2022-11-26 09:15:58,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-26 09:15:58,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-26 09:15:58,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 51: [2022-11-26 09:15:58,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 17: [2022-11-26 09:15:58,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 10: [2022-11-26 09:15:58,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 60: [2022-11-26 09:15:58,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 17: [2022-11-26 09:15:58,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 60: [2022-11-26 09:15:58,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 10: [2022-11-26 09:15:58,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 17: [2022-11-26 09:15:58,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 60: [2022-11-26 09:15:58,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 10: [2022-11-26 09:15:58,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 36: [2022-11-26 09:15:58,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 36: [2022-11-26 09:15:58,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 59: [2022-11-26 09:15:58,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 59: [2022-11-26 09:15:58,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-26 09:15:58,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 23: [2022-11-26 09:15:58,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:15:58,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-26 09:15:58,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 51: [2022-11-26 09:15:58,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-26 09:15:58,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 58: [2022-11-26 09:15:58,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 58: [2022-11-26 09:15:58,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 58: [2022-11-26 09:15:58,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 24: [2022-11-26 09:15:58,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-26 09:15:58,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-26 09:15:58,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 41: [2022-11-26 09:15:58,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 41: [2022-11-26 09:15:58,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 41: [2022-11-26 09:15:58,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 56: [2022-11-26 09:15:58,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 56: [2022-11-26 09:15:58,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 56: [2022-11-26 09:15:58,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 21: [2022-11-26 09:15:58,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 21: [2022-11-26 09:15:58,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 21: [2022-11-26 09:15:58,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 1: [2022-11-26 09:15:58,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-26 09:15:58,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-26 09:15:58,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 3: [2022-11-26 09:15:58,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 3: [2022-11-26 09:15:58,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 2: [2022-11-26 09:15:58,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 3: [2022-11-26 09:15:58,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 2: [2022-11-26 09:15:58,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 2: [2022-11-26 09:15:58,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 15: [2022-11-26 09:15:58,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 15: [2022-11-26 09:15:58,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 15: [2022-11-26 09:15:58,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 45: [2022-11-26 09:15:58,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 45: [2022-11-26 09:15:58,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 45: [2022-11-26 09:15:58,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 25: [2022-11-26 09:15:58,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 7: [2022-11-26 09:15:58,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 25: [2022-11-26 09:15:58,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 7: [2022-11-26 09:15:58,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 25: [2022-11-26 09:15:58,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 7: [2022-11-26 09:15:58,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 62: [2022-11-26 09:15:58,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 62: [2022-11-26 09:15:58,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 62: [2022-11-26 09:15:58,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 30: [2022-11-26 09:15:58,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 30: [2022-11-26 09:15:58,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 30: [2022-11-26 09:15:58,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 31: [2022-11-26 09:15:58,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 31: [2022-11-26 09:15:58,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-26 09:15:58,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 4: [2022-11-26 09:15:58,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-26 09:15:58,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 4: [2022-11-26 09:15:58,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 5: [2022-11-26 09:15:58,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 5: [2022-11-26 09:15:58,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 5: [2022-11-26 09:15:58,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 53: [2022-11-26 09:15:58,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 53: [2022-11-26 09:15:58,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 53: [2022-11-26 09:15:58,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 12: [2022-11-26 09:15:58,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 32: [2022-11-26 09:15:58,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 12: [2022-11-26 09:15:58,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-26 09:15:58,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 32: [2022-11-26 09:15:58,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 32: [2022-11-26 09:15:58,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 19: [2022-11-26 09:15:58,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 35: [2022-11-26 09:15:58,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 19: [2022-11-26 09:15:58,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 35: [2022-11-26 09:15:58,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 19: [2022-11-26 09:15:58,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 35: [2022-11-26 09:15:58,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 18: [2022-11-26 09:15:58,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 18: [2022-11-26 09:15:58,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 18: [2022-11-26 09:15:58,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 6: [2022-11-26 09:15:58,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-26 09:15:58,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-26 09:15:58,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 28: [2022-11-26 09:15:58,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-26 09:15:58,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-26 09:15:58,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 39: [2022-11-26 09:15:58,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-26 09:15:58,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-26 09:15:58,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 40: [2022-11-26 09:15:58,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-26 09:15:58,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-26 09:15:58,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 14: [2022-11-26 09:15:58,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 14: [2022-11-26 09:15:58,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 14: [2022-11-26 09:15:58,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 29: [2022-11-26 09:15:58,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 29: [2022-11-26 09:15:58,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 29: [2022-11-26 09:15:58,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 43: [2022-11-26 09:15:58,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-26 09:15:58,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-26 09:15:58,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 46: [2022-11-26 09:15:58,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 46: [2022-11-26 09:15:58,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-26 09:15:58,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 9: [2022-11-26 09:15:58,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 9: [2022-11-26 09:15:58,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 9: [2022-11-26 09:15:58,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 61: [2022-11-26 09:15:58,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 61: [2022-11-26 09:15:58,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 61: [2022-11-26 09:15:58,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 50: [2022-11-26 09:15:58,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 50: [2022-11-26 09:15:58,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 50: [2022-11-26 09:15:58,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 42: [2022-11-26 09:15:58,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 42: [2022-11-26 09:15:58,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-26 09:15:58,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 44: [2022-11-26 09:15:58,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 44: [2022-11-26 09:15:58,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 44: [2022-11-26 09:15:58,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 16: [2022-11-26 09:15:58,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-26 09:15:58,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-26 09:15:58,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 22: [2022-11-26 09:15:58,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 8: [2022-11-26 09:15:58,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 22: [2022-11-26 09:15:58,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-26 09:15:58,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 8: [2022-11-26 09:15:58,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-26 09:15:58,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 13: [2022-11-26 09:15:58,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 13: [2022-11-26 09:15:58,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 13: [2022-11-26 09:15:58,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 20: [2022-11-26 09:15:58,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:15:58,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 20: [2022-11-26 09:15:58,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 27: [2022-11-26 09:15:58,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 27: [2022-11-26 09:15:58,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-26 09:15:58,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 11: [2022-11-26 09:15:58,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 57: [2022-11-26 09:15:58,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 11: [2022-11-26 09:15:58,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 11: [2022-11-26 09:15:58,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 38: [2022-11-26 09:15:58,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 57: [2022-11-26 09:15:58,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 57: [2022-11-26 09:15:58,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 38: [2022-11-26 09:15:58,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-26 09:15:58,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 47: [2022-11-26 09:15:58,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-26 09:15:58,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-26 09:15:58,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 34: [2022-11-26 09:15:58,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 48: [2022-11-26 09:15:58,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 34: [2022-11-26 09:15:58,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 34: [2022-11-26 09:15:58,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 48: [2022-11-26 09:15:58,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 48: [2022-11-26 09:15:58,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 54: [2022-11-26 09:15:58,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 54: [2022-11-26 09:15:58,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-26 09:15:58,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 37: [2022-11-26 09:15:58,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 37: [2022-11-26 09:15:58,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 37: [2022-11-26 09:15:58,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 33: [2022-11-26 09:15:58,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 33: [2022-11-26 09:15:58,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 33: [2022-11-26 09:15:58,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 26: [2022-11-26 09:15:58,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 63: [2022-11-26 09:15:58,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-26 09:15:58,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 26: [2022-11-26 09:15:58,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 63: [2022-11-26 09:15:58,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 26: [2022-11-26 09:15:58,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 17: [2022-11-26 09:15:58,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-26 09:15:58,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 52: [2022-11-26 09:15:58,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 17: [2022-11-26 09:15:58,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 52: [2022-11-26 09:15:58,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-26 09:15:58,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 0: [2022-11-26 09:15:58,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-26 09:15:58,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 0: [2022-11-26 09:15:58,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 51: [2022-11-26 09:15:58,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 45: [2022-11-26 09:15:58,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 51: [2022-11-26 09:15:58,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-26 09:15:58,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 45: [2022-11-26 09:15:58,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 45: [2022-11-26 09:15:58,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 60: [2022-11-26 09:15:58,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-26 09:15:58,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 60: [2022-11-26 09:15:58,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 36: [2022-11-26 09:15:58,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-26 09:15:58,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-26 09:15:58,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 23: [2022-11-26 09:15:58,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:15:58,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 23: [2022-11-26 09:15:58,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 55: [2022-11-26 09:15:58,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-26 09:15:58,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-26 09:15:58,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 58: [2022-11-26 09:15:58,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-26 09:15:58,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-26 09:15:58,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 24: [2022-11-26 09:15:58,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 24: [2022-11-26 09:15:58,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-26 09:15:58,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 10: [2022-11-26 09:15:58,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 10: [2022-11-26 09:15:58,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 10: [2022-11-26 09:15:58,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 59: [2022-11-26 09:15:58,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 59: [2022-11-26 09:15:58,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 59: [2022-11-26 09:15:58,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 49: [2022-11-26 09:15:58,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-26 09:15:58,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-26 09:15:58,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 2: [2022-11-26 09:15:58,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 2: [2022-11-26 09:15:58,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 2: [2022-11-26 09:15:58,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 56: [2022-11-26 09:15:58,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 56: [2022-11-26 09:15:58,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 56: [2022-11-26 09:15:58,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 15: [2022-11-26 09:15:58,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 41: [2022-11-26 09:15:58,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 15: [2022-11-26 09:15:58,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-26 09:15:58,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 41: [2022-11-26 09:15:58,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 41: [2022-11-26 09:15:58,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 1: [2022-11-26 09:15:58,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-26 09:15:58,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-26 09:15:58,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 31: [2022-11-26 09:15:58,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 31: [2022-11-26 09:15:58,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 31: [2022-11-26 09:15:58,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 3: [2022-11-26 09:15:58,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 3: [2022-11-26 09:15:58,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 3: [2022-11-26 09:15:58,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 30: [2022-11-26 09:15:58,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 30: [2022-11-26 09:15:58,179] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 30: [2022-11-26 09:15:58,179] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 25: [2022-11-26 09:15:58,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-26 09:15:58,179] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-26 09:15:58,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 21: [2022-11-26 09:15:58,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 21: [2022-11-26 09:15:58,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-26 09:15:58,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 62: [2022-11-26 09:15:58,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 62: [2022-11-26 09:15:58,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 62: [2022-11-26 09:15:58,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 6: [2022-11-26 09:15:58,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-26 09:15:58,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-26 09:15:58,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 5: [2022-11-26 09:15:58,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-26 09:15:58,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-26 09:15:58,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 32: [2022-11-26 09:15:58,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-26 09:15:58,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-26 09:15:58,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 4: [2022-11-26 09:15:58,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 4: [2022-11-26 09:15:58,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-26 09:15:58,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 7: [2022-11-26 09:15:58,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 7: [2022-11-26 09:15:58,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 7: [2022-11-26 09:15:58,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 28: [2022-11-26 09:15:58,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 28: [2022-11-26 09:15:58,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 28: [2022-11-26 09:15:58,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 35: [2022-11-26 09:15:58,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 35: [2022-11-26 09:15:58,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 35: [2022-11-26 09:15:58,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 53: [2022-11-26 09:15:58,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-26 09:15:58,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-26 09:15:58,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 39: [2022-11-26 09:15:58,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 39: [2022-11-26 09:15:58,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 39: [2022-11-26 09:15:58,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 19: [2022-11-26 09:15:58,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 19: [2022-11-26 09:15:58,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 19: [2022-11-26 09:15:58,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 12: [2022-11-26 09:15:58,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 12: [2022-11-26 09:15:58,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 12: [2022-11-26 09:15:58,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 29: [2022-11-26 09:15:58,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-26 09:15:58,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-26 09:15:58,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 18: [2022-11-26 09:15:58,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-26 09:15:58,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 18: [2022-11-26 09:15:58,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 9: [2022-11-26 09:15:58,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 13: [2022-11-26 09:15:58,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 13: [2022-11-26 09:15:58,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 13: [2022-11-26 09:15:58,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 40: [2022-11-26 09:15:58,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-26 09:15:58,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-26 09:15:58,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 42: [2022-11-26 09:15:58,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 42: [2022-11-26 09:15:58,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-26 09:15:58,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 9: [2022-11-26 09:15:58,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-26 09:15:58,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 27: [2022-11-26 09:15:58,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-26 09:15:58,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-26 09:15:58,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 11: [2022-11-26 09:15:58,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 11: [2022-11-26 09:15:58,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-26 09:15:58,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 16: [2022-11-26 09:15:58,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-26 09:15:58,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 46: [2022-11-26 09:15:58,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 16: [2022-11-26 09:15:58,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 46: [2022-11-26 09:15:58,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-26 09:15:58,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 61: [2022-11-26 09:15:58,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 61: [2022-11-26 09:15:58,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 61: [2022-11-26 09:15:58,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 14: [2022-11-26 09:15:58,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 14: [2022-11-26 09:15:58,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 14: [2022-11-26 09:15:58,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 50: [2022-11-26 09:15:58,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-26 09:15:58,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-26 09:15:58,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 34: [2022-11-26 09:15:58,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 34: [2022-11-26 09:15:58,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 34: [2022-11-26 09:15:58,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 43: [2022-11-26 09:15:58,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-26 09:15:58,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-26 09:15:58,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 8: [2022-11-26 09:15:58,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:15:58,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 8: [2022-11-26 09:15:58,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 20: [2022-11-26 09:15:58,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 8: [2022-11-26 09:15:58,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 20: [2022-11-26 09:15:58,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 38: [2022-11-26 09:15:58,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 37: [2022-11-26 09:15:58,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-26 09:15:58,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-26 09:15:58,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 38: [2022-11-26 09:15:58,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 38: [2022-11-26 09:15:58,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 54: [2022-11-26 09:15:58,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 54: [2022-11-26 09:15:58,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 54: [2022-11-26 09:15:58,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 52: [2022-11-26 09:15:58,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 22: [2022-11-26 09:15:58,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 52: [2022-11-26 09:15:58,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-26 09:15:58,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 22: [2022-11-26 09:15:58,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-26 09:15:58,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 44: [2022-11-26 09:15:58,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-26 09:15:58,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-26 09:15:58,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 17: [2022-11-26 09:15:58,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 17: [2022-11-26 09:15:58,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 17: [2022-11-26 09:15:58,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 57: [2022-11-26 09:15:58,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 57: [2022-11-26 09:15:58,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 57: [2022-11-26 09:15:58,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 47: [2022-11-26 09:15:58,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 47: [2022-11-26 09:15:58,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-26 09:15:58,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 58: [2022-11-26 09:15:58,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-26 09:15:58,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 49: [2022-11-26 09:15:58,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 58: [2022-11-26 09:15:58,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 49: [2022-11-26 09:15:58,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 49: [2022-11-26 09:15:58,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 33: [2022-11-26 09:15:58,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 33: [2022-11-26 09:15:58,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 33: [2022-11-26 09:15:58,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 63: [2022-11-26 09:15:58,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 63: [2022-11-26 09:15:58,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-26 09:15:58,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 0: [2022-11-26 09:15:58,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 48: [2022-11-26 09:15:58,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 0: [2022-11-26 09:15:58,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 48: [2022-11-26 09:15:58,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 0: [2022-11-26 09:15:58,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 48: [2022-11-26 09:15:58,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 45: [2022-11-26 09:15:58,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 45: [2022-11-26 09:15:58,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 45: [2022-11-26 09:15:58,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 55: [2022-11-26 09:15:58,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 55: [2022-11-26 09:15:58,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 55: [2022-11-26 09:15:58,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 23: [2022-11-26 09:15:58,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 26: [2022-11-26 09:15:58,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:15:58,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 26: [2022-11-26 09:15:58,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 23: [2022-11-26 09:15:58,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 26: [2022-11-26 09:15:58,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 24: [2022-11-26 09:15:58,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 24: [2022-11-26 09:15:58,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-26 09:15:58,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 59: [2022-11-26 09:15:58,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-26 09:15:58,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-26 09:15:58,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 51: [2022-11-26 09:15:58,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 51: [2022-11-26 09:15:58,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 51: [2022-11-26 09:15:58,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 30: [2022-11-26 09:15:58,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 30: [2022-11-26 09:15:58,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-26 09:15:58,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 15: [2022-11-26 09:15:58,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 15: [2022-11-26 09:15:58,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 15: [2022-11-26 09:15:58,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 36: [2022-11-26 09:15:58,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 36: [2022-11-26 09:15:58,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 36: [2022-11-26 09:15:58,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 56: [2022-11-26 09:15:58,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 56: [2022-11-26 09:15:58,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 56: [2022-11-26 09:15:58,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 12: [2022-11-26 09:15:58,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 12: [2022-11-26 09:15:58,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 12: [2022-11-26 09:15:58,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 25: [2022-11-26 09:15:58,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 25: [2022-11-26 09:15:58,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 25: [2022-11-26 09:15:58,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 1: [2022-11-26 09:15:58,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 1: [2022-11-26 09:15:58,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 1: [2022-11-26 09:15:58,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 35: [2022-11-26 09:15:58,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 4: [2022-11-26 09:15:58,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 35: [2022-11-26 09:15:58,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 35: [2022-11-26 09:15:58,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 4: [2022-11-26 09:15:58,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-26 09:15:58,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 3: [2022-11-26 09:15:58,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 3: [2022-11-26 09:15:58,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 3: [2022-11-26 09:15:58,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 62: [2022-11-26 09:15:58,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-26 09:15:58,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-26 09:15:58,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 41: [2022-11-26 09:15:58,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 41: [2022-11-26 09:15:58,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 41: [2022-11-26 09:15:58,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 2: [2022-11-26 09:15:58,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-26 09:15:58,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-26 09:15:58,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 6: [2022-11-26 09:15:58,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 6: [2022-11-26 09:15:58,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 6: [2022-11-26 09:15:58,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 10: [2022-11-26 09:15:58,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-26 09:15:58,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-26 09:15:58,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 60: [2022-11-26 09:15:58,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 60: [2022-11-26 09:15:58,240] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 60: [2022-11-26 09:15:58,240] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 31: [2022-11-26 09:15:58,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 31: [2022-11-26 09:15:58,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 31: [2022-11-26 09:15:58,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 41: [2022-11-26 09:15:58,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-26 09:15:58,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-26 09:15:58,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 38: [2022-11-26 09:15:58,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 38: [2022-11-26 09:15:58,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-26 09:15:58,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 18: [2022-11-26 09:15:58,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 18: [2022-11-26 09:15:58,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 18: [2022-11-26 09:15:58,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 11: [2022-11-26 09:15:58,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-26 09:15:58,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 5: [2022-11-26 09:15:58,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 11: [2022-11-26 09:15:58,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 5: [2022-11-26 09:15:58,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 5: [2022-11-26 09:15:58,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 53: [2022-11-26 09:15:58,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 21: [2022-11-26 09:15:58,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 16: [2022-11-26 09:15:58,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 31: [2022-11-26 09:15:58,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 53: [2022-11-26 09:15:58,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 31: [2022-11-26 09:15:58,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 53: [2022-11-26 09:15:58,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 21: [2022-11-26 09:15:58,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 16: [2022-11-26 09:15:58,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 31: [2022-11-26 09:15:58,243] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 21: [2022-11-26 09:15:58,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 16: [2022-11-26 09:15:58,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 7: [2022-11-26 09:15:58,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 7: [2022-11-26 09:15:58,243] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 7: [2022-11-26 09:15:58,243] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 20: [2022-11-26 09:15:58,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 19: [2022-11-26 09:15:58,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:15:58,243] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 20: [2022-11-26 09:15:58,243] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 19: [2022-11-26 09:15:58,243] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-26 09:15:58,243] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 19: [2022-11-26 09:15:58,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 58: [2022-11-26 09:15:58,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 19: [2022-11-26 09:15:58,243] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-26 09:15:58,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 58: [2022-11-26 09:15:58,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-26 09:15:58,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 50: [2022-11-26 09:15:58,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 49: [2022-11-26 09:15:58,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 50: [2022-11-26 09:15:58,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 40: [2022-11-26 09:15:58,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 49: [2022-11-26 09:15:58,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 50: [2022-11-26 09:15:58,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 49: [2022-11-26 09:15:58,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 40: [2022-11-26 09:15:58,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-26 09:15:58,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 39: [2022-11-26 09:15:58,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 32: [2022-11-26 09:15:58,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 39: [2022-11-26 09:15:58,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 32: [2022-11-26 09:15:58,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 39: [2022-11-26 09:15:58,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 32: [2022-11-26 09:15:58,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 44: [2022-11-26 09:15:58,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 57: [2022-11-26 09:15:58,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 44: [2022-11-26 09:15:58,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 57: [2022-11-26 09:15:58,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 44: [2022-11-26 09:15:58,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 57: [2022-11-26 09:15:58,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 47: [2022-11-26 09:15:58,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-26 09:15:58,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-26 09:15:58,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 14: [2022-11-26 09:15:58,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 63: [2022-11-26 09:15:58,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 14: [2022-11-26 09:15:58,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 63: [2022-11-26 09:15:58,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 14: [2022-11-26 09:15:58,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 63: [2022-11-26 09:15:58,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 4: [2022-11-26 09:15:58,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 48: [2022-11-26 09:15:58,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 4: [2022-11-26 09:15:58,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 48: [2022-11-26 09:15:58,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 4: [2022-11-26 09:15:58,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 8: [2022-11-26 09:15:58,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 48: [2022-11-26 09:15:58,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 61: [2022-11-26 09:15:58,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 8: [2022-11-26 09:15:58,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 61: [2022-11-26 09:15:58,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 8: [2022-11-26 09:15:58,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 43: [2022-11-26 09:15:58,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 61: [2022-11-26 09:15:58,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 43: [2022-11-26 09:15:58,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-26 09:15:58,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 51: [2022-11-26 09:15:58,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 28: [2022-11-26 09:15:58,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 51: [2022-11-26 09:15:58,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 51: [2022-11-26 09:15:58,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 46: [2022-11-26 09:15:58,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 28: [2022-11-26 09:15:58,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-26 09:15:58,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 46: [2022-11-26 09:15:58,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 55: [2022-11-26 09:15:58,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 46: [2022-11-26 09:15:58,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 59: [2022-11-26 09:15:58,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 55: [2022-11-26 09:15:58,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 55: [2022-11-26 09:15:58,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 59: [2022-11-26 09:15:58,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 59: [2022-11-26 09:15:58,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 34: [2022-11-26 09:15:58,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 60: [2022-11-26 09:15:58,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 34: [2022-11-26 09:15:58,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 60: [2022-11-26 09:15:58,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 34: [2022-11-26 09:15:58,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 60: [2022-11-26 09:15:58,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 53: [2022-11-26 09:15:58,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 53: [2022-11-26 09:15:58,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 53: [2022-11-26 09:15:58,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 13: [2022-11-26 09:15:58,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 13: [2022-11-26 09:15:58,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 13: [2022-11-26 09:15:58,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 27: [2022-11-26 09:15:58,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-26 09:15:58,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-26 09:15:58,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 22: [2022-11-26 09:15:58,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 22: [2022-11-26 09:15:58,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 22: [2022-11-26 09:15:58,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 29: [2022-11-26 09:15:58,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 29: [2022-11-26 09:15:58,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 29: [2022-11-26 09:15:58,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 36: [2022-11-26 09:15:58,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 36: [2022-11-26 09:15:58,249] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-26 09:15:58,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 42: [2022-11-26 09:15:58,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 2: [2022-11-26 09:15:58,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 42: [2022-11-26 09:15:58,250] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 2: [2022-11-26 09:15:58,250] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 42: [2022-11-26 09:15:58,250] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 2: [2022-11-26 09:15:58,250] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 28: [2022-11-26 09:15:58,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 28: [2022-11-26 09:15:58,250] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-26 09:15:58,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 39: [2022-11-26 09:15:58,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-26 09:15:58,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-26 09:15:58,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 26: [2022-11-26 09:15:58,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 26: [2022-11-26 09:15:58,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 26: [2022-11-26 09:15:58,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 9: [2022-11-26 09:15:58,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 9: [2022-11-26 09:15:58,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 9: [2022-11-26 09:15:58,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 21: [2022-11-26 09:15:58,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 21: [2022-11-26 09:15:58,253] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 21: [2022-11-26 09:15:58,253] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 37: [2022-11-26 09:15:58,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 37: [2022-11-26 09:15:58,254] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-26 09:15:58,254] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 54: [2022-11-26 09:15:58,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 54: [2022-11-26 09:15:58,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-26 09:15:58,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 9: [2022-11-26 09:15:58,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-26 09:15:58,256] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-26 09:15:58,256] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 33: [2022-11-26 09:15:58,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 33: [2022-11-26 09:15:58,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 33: [2022-11-26 09:15:58,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! 0: successfully saved checkpoint at iteration 24000 to checkpoints_3b9 63: time (ms) | save-checkpoint: 5331.40 63: iteration 24010/ 24424 | consumed samples: 12293120 | consumed tokens: 25176309760 | elapsed time per iteration (s): 2.94 | learning rate: 2.013E-05 | global batch size: 512 | lm loss: 1.994645E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.859 | TFLOPs: 17.90 | 63: iteration 24020/ 24424 | consumed samples: 12298240 | consumed tokens: 25186795520 | elapsed time per iteration (s): 2.26 | learning rate: 2.012E-05 | global batch size: 512 | lm loss: 1.974342E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.087 | TFLOPs: 23.27 | 63: iteration 24030/ 24424 | consumed samples: 12303360 | consumed tokens: 25197281280 | elapsed time per iteration (s): 2.26 | learning rate: 2.012E-05 | global batch size: 512 | lm loss: 1.980875E+00 | grad norm: 0.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.711 | TFLOPs: 23.34 | 63: iteration 24040/ 24424 | consumed samples: 12308480 | consumed tokens: 25207767040 | elapsed time per iteration (s): 2.28 | learning rate: 2.011E-05 | global batch size: 512 | lm loss: 1.972531E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.995 | TFLOPs: 23.16 | 63: iteration 24050/ 24424 | consumed samples: 12313600 | consumed tokens: 25218252800 | elapsed time per iteration (s): 2.26 | learning rate: 2.011E-05 | global batch size: 512 | lm loss: 1.962805E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.030 | TFLOPs: 23.37 | 63: iteration 24060/ 24424 | consumed samples: 12318720 | consumed tokens: 25228738560 | elapsed time per iteration (s): 2.26 | learning rate: 2.010E-05 | global batch size: 512 | lm loss: 1.957435E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.184 | TFLOPs: 23.28 | 63: iteration 24070/ 24424 | consumed samples: 12323840 | consumed tokens: 25239224320 | elapsed time per iteration (s): 2.25 | learning rate: 2.010E-05 | global batch size: 512 | lm loss: 1.970605E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.327 | TFLOPs: 23.40 | 63: iteration 24080/ 24424 | consumed samples: 12328960 | consumed tokens: 25249710080 | elapsed time per iteration (s): 2.28 | learning rate: 2.009E-05 | global batch size: 512 | lm loss: 1.972238E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.109 | TFLOPs: 23.07 | 63: iteration 24090/ 24424 | consumed samples: 12334080 | consumed tokens: 25260195840 | elapsed time per iteration (s): 2.29 | learning rate: 2.009E-05 | global batch size: 512 | lm loss: 1.976756E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.539 | TFLOPs: 23.01 | 63: iteration 24100/ 24424 | consumed samples: 12339200 | consumed tokens: 25270681600 | elapsed time per iteration (s): 2.24 | learning rate: 2.008E-05 | global batch size: 512 | lm loss: 1.967300E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.183 | TFLOPs: 23.49 | 63: iteration 24110/ 24424 | consumed samples: 12344320 | consumed tokens: 25281167360 | elapsed time per iteration (s): 2.29 | learning rate: 2.008E-05 | global batch size: 512 | lm loss: 1.966536E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 223.242 | TFLOPs: 22.98 | 63: iteration 24120/ 24424 | consumed samples: 12349440 | consumed tokens: 25291653120 | elapsed time per iteration (s): 2.24 | learning rate: 2.007E-05 | global batch size: 512 | lm loss: 1.958392E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.513 | TFLOPs: 23.52 | 63: iteration 24130/ 24424 | consumed samples: 12354560 | consumed tokens: 25302138880 | elapsed time per iteration (s): 2.26 | learning rate: 2.007E-05 | global batch size: 512 | lm loss: 1.955032E+00 | grad norm: 0.120 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.595 | TFLOPs: 23.33 | 63: iteration 24140/ 24424 | consumed samples: 12359680 | consumed tokens: 25312624640 | elapsed time per iteration (s): 2.60 | learning rate: 2.006E-05 | global batch size: 512 | lm loss: 1.988428E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 196.809 | TFLOPs: 20.26 | 63: iteration 24150/ 24424 | consumed samples: 12364800 | consumed tokens: 25323110400 | elapsed time per iteration (s): 2.26 | learning rate: 2.006E-05 | global batch size: 512 | lm loss: 1.985008E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.856 | TFLOPs: 23.35 | 63: iteration 24160/ 24424 | consumed samples: 12369920 | consumed tokens: 25333596160 | elapsed time per iteration (s): 2.23 | learning rate: 2.005E-05 | global batch size: 512 | lm loss: 1.984414E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.337 | TFLOPs: 23.61 | 63: iteration 24170/ 24424 | consumed samples: 12375040 | consumed tokens: 25344081920 | elapsed time per iteration (s): 2.24 | learning rate: 2.005E-05 | global batch size: 512 | lm loss: 1.959202E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.054 | TFLOPs: 23.58 | 63: iteration 24180/ 24424 | consumed samples: 12380160 | consumed tokens: 25354567680 | elapsed time per iteration (s): 2.23 | learning rate: 2.005E-05 | global batch size: 512 | lm loss: 1.969359E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.840 | TFLOPs: 23.66 | 63: iteration 24190/ 24424 | consumed samples: 12385280 | consumed tokens: 25365053440 | elapsed time per iteration (s): 2.23 | learning rate: 2.004E-05 | global batch size: 512 | lm loss: 1.982051E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.677 | TFLOPs: 23.64 | 63: iteration 24200/ 24424 | consumed samples: 12390400 | consumed tokens: 25375539200 | elapsed time per iteration (s): 2.25 | learning rate: 2.004E-05 | global batch size: 512 | lm loss: 1.963434E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.059 | TFLOPs: 23.48 | 63: iteration 24210/ 24424 | consumed samples: 12395520 | consumed tokens: 25386024960 | elapsed time per iteration (s): 2.24 | learning rate: 2.004E-05 | global batch size: 512 | lm loss: 1.967504E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.617 | TFLOPs: 23.54 | 63: iteration 24220/ 24424 | consumed samples: 12400640 | consumed tokens: 25396510720 | elapsed time per iteration (s): 2.23 | learning rate: 2.003E-05 | global batch size: 512 | lm loss: 1.973684E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.086 | TFLOPs: 23.69 | 63: iteration 24230/ 24424 | consumed samples: 12405760 | consumed tokens: 25406996480 | elapsed time per iteration (s): 2.23 | learning rate: 2.003E-05 | global batch size: 512 | lm loss: 1.969044E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.965 | TFLOPs: 23.67 | 63: iteration 24240/ 24424 | consumed samples: 12410880 | consumed tokens: 25417482240 | elapsed time per iteration (s): 2.25 | learning rate: 2.003E-05 | global batch size: 512 | lm loss: 1.956585E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.738 | TFLOPs: 23.44 | 63: iteration 24250/ 24424 | consumed samples: 12416000 | consumed tokens: 25427968000 | elapsed time per iteration (s): 2.24 | learning rate: 2.002E-05 | global batch size: 512 | lm loss: 1.987387E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.359 | TFLOPs: 23.51 | 63: iteration 24260/ 24424 | consumed samples: 12421120 | consumed tokens: 25438453760 | elapsed time per iteration (s): 2.26 | learning rate: 2.002E-05 | global batch size: 512 | lm loss: 1.971124E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.071 | TFLOPs: 23.27 | 63: iteration 24270/ 24424 | consumed samples: 12426240 | consumed tokens: 25448939520 | elapsed time per iteration (s): 2.22 | learning rate: 2.002E-05 | global batch size: 512 | lm loss: 1.966884E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 230.347 | TFLOPs: 23.71 | 63: iteration 24280/ 24424 | consumed samples: 12431360 | consumed tokens: 25459425280 | elapsed time per iteration (s): 2.24 | learning rate: 2.002E-05 | global batch size: 512 | lm loss: 1.981893E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.179 | TFLOPs: 23.49 | 63: iteration 24290/ 24424 | consumed samples: 12436480 | consumed tokens: 25469911040 | elapsed time per iteration (s): 2.24 | learning rate: 2.001E-05 | global batch size: 512 | lm loss: 1.969450E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.063 | TFLOPs: 23.58 | 63: iteration 24300/ 24424 | consumed samples: 12441600 | consumed tokens: 25480396800 | elapsed time per iteration (s): 2.25 | learning rate: 2.001E-05 | global batch size: 512 | lm loss: 1.967311E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.256 | TFLOPs: 23.39 | 63: iteration 24310/ 24424 | consumed samples: 12446720 | consumed tokens: 25490882560 | elapsed time per iteration (s): 2.23 | learning rate: 2.001E-05 | global batch size: 512 | lm loss: 1.976595E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.853 | TFLOPs: 23.66 | 63: iteration 24320/ 24424 | consumed samples: 12451840 | consumed tokens: 25501368320 | elapsed time per iteration (s): 2.23 | learning rate: 2.001E-05 | global batch size: 512 | lm loss: 1.975484E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.945 | TFLOPs: 23.67 | 63: iteration 24330/ 24424 | consumed samples: 12456960 | consumed tokens: 25511854080 | elapsed time per iteration (s): 2.39 | learning rate: 2.001E-05 | global batch size: 512 | lm loss: 1.982416E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 213.913 | TFLOPs: 22.02 | 63: iteration 24340/ 24424 | consumed samples: 12462080 | consumed tokens: 25522339840 | elapsed time per iteration (s): 2.26 | learning rate: 2.001E-05 | global batch size: 512 | lm loss: 1.966823E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 226.182 | TFLOPs: 23.28 | 63: iteration 24350/ 24424 | consumed samples: 12467200 | consumed tokens: 25532825600 | elapsed time per iteration (s): 2.24 | learning rate: 2.000E-05 | global batch size: 512 | lm loss: 1.965262E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.612 | TFLOPs: 23.53 | 63: iteration 24360/ 24424 | consumed samples: 12472320 | consumed tokens: 25543311360 | elapsed time per iteration (s): 2.24 | learning rate: 2.000E-05 | global batch size: 512 | lm loss: 1.978983E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.183 | TFLOPs: 23.49 | 63: iteration 24370/ 24424 | consumed samples: 12477440 | consumed tokens: 25553797120 | elapsed time per iteration (s): 2.32 | learning rate: 2.000E-05 | global batch size: 512 | lm loss: 1.962766E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.094 | TFLOPs: 22.76 | 63: iteration 24380/ 24424 | consumed samples: 12482560 | consumed tokens: 25564282880 | elapsed time per iteration (s): 2.24 | learning rate: 2.000E-05 | global batch size: 512 | lm loss: 1.969004E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 228.325 | TFLOPs: 23.50 | 63: iteration 24390/ 24424 | consumed samples: 12487680 | consumed tokens: 25574768640 | elapsed time per iteration (s): 2.27 | learning rate: 2.000E-05 | global batch size: 512 | lm loss: 1.976461E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.799 | TFLOPs: 23.24 | 63: iteration 24400/ 24424 | consumed samples: 12492800 | consumed tokens: 25585254400 | elapsed time per iteration (s): 2.27 | learning rate: 2.000E-05 | global batch size: 512 | lm loss: 1.977694E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 225.806 | TFLOPs: 23.25 | 63: iteration 24410/ 24424 | consumed samples: 12497920 | consumed tokens: 25595740160 | elapsed time per iteration (s): 2.28 | learning rate: 2.000E-05 | global batch size: 512 | lm loss: 1.975448E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 224.847 | TFLOPs: 23.15 | 63: iteration 24420/ 24424 | consumed samples: 12503040 | consumed tokens: 25606225920 | elapsed time per iteration (s): 2.25 | learning rate: 2.000E-05 | global batch size: 512 | lm loss: 1.976127E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 227.165 | TFLOPs: 23.39 | 0: [after training is done] datetime: 2022-11-26 09:31:59 0: saving checkpoint at iteration 24424 to checkpoints_3b9 63: ------------------------------------------------------------------------------------------------------------ 63: valid loss at the end of training for val data | lm loss value: 1.943817E+00 | lm loss PPL: 6.985361E+00 | 63: ------------------------------------------------------------------------------------------------------------ 0: [2022-11-26 09:31:59,595] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step24424 is begin to save! 32: [2022-11-26 09:31:59,627] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_21-model_00-model_states.pt... 0: [2022-11-26 09:31:59,630] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_01-model_00-model_states.pt... 32: [2022-11-26 09:31:59,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_21-model_00-model_states.pt. 32: [2022-11-26 09:31:59,942] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_22-model_00-model_states.pt... 0: [2022-11-26 09:32:00,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_01-model_00-model_states.pt. 0: [2022-11-26 09:32:00,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_03-model_00-model_states.pt... 32: [2022-11-26 09:32:00,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_22-model_00-model_states.pt. 32: [2022-11-26 09:32:00,176] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_23-model_00-model_states.pt... 0: [2022-11-26 09:32:00,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_03-model_00-model_states.pt. 0: [2022-11-26 09:32:00,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_04-model_00-model_states.pt... 32: [2022-11-26 09:32:00,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_23-model_00-model_states.pt. 32: [2022-11-26 09:32:00,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_24-model_00-model_states.pt... 0: [2022-11-26 09:32:00,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_04-model_00-model_states.pt. 0: [2022-11-26 09:32:00,508] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_05-model_00-model_states.pt... 32: [2022-11-26 09:32:00,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_24-model_00-model_states.pt. 32: [2022-11-26 09:32:00,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_25-model_00-model_states.pt... 0: [2022-11-26 09:32:00,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_05-model_00-model_states.pt. 0: [2022-11-26 09:32:00,729] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_06-model_00-model_states.pt... 32: [2022-11-26 09:32:00,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_25-model_00-model_states.pt. 32: [2022-11-26 09:32:00,871] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_26-model_00-model_states.pt... 0: [2022-11-26 09:32:00,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_06-model_00-model_states.pt. 0: [2022-11-26 09:32:00,951] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_07-model_00-model_states.pt... 32: [2022-11-26 09:32:01,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_26-model_00-model_states.pt. 32: [2022-11-26 09:32:01,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_27-model_00-model_states.pt... 0: [2022-11-26 09:32:01,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_07-model_00-model_states.pt. 0: [2022-11-26 09:32:01,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_08-model_00-model_states.pt... 32: [2022-11-26 09:32:01,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_27-model_00-model_states.pt. 32: [2022-11-26 09:32:01,320] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_28-model_00-model_states.pt... 0: [2022-11-26 09:32:01,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_08-model_00-model_states.pt. 0: [2022-11-26 09:32:01,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_09-model_00-model_states.pt... 32: [2022-11-26 09:32:01,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_28-model_00-model_states.pt. 32: [2022-11-26 09:32:01,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_29-model_00-model_states.pt... 0: [2022-11-26 09:32:01,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_09-model_00-model_states.pt. 0: [2022-11-26 09:32:01,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_10-model_00-model_states.pt... 32: [2022-11-26 09:32:01,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_29-model_00-model_states.pt. 32: [2022-11-26 09:32:01,768] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_30-model_00-model_states.pt... 0: [2022-11-26 09:32:01,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_10-model_00-model_states.pt. 0: [2022-11-26 09:32:01,835] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_11-model_00-model_states.pt... 32: [2022-11-26 09:32:01,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_30-model_00-model_states.pt. 32: [2022-11-26 09:32:01,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_31-model_00-model_states.pt... 0: [2022-11-26 09:32:02,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_11-model_00-model_states.pt. 0: [2022-11-26 09:32:02,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_12-model_00-model_states.pt... 32: [2022-11-26 09:32:02,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_31-model_00-model_states.pt. 32: [2022-11-26 09:32:02,215] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_32-model_00-model_states.pt... 0: [2022-11-26 09:32:02,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_12-model_00-model_states.pt. 0: [2022-11-26 09:32:02,263] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_13-model_00-model_states.pt... 32: [2022-11-26 09:32:02,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_32-model_00-model_states.pt. 32: [2022-11-26 09:32:02,431] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_33-model_00-model_states.pt... 0: [2022-11-26 09:32:02,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_13-model_00-model_states.pt. 0: [2022-11-26 09:32:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_14-model_00-model_states.pt... 32: [2022-11-26 09:32:02,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_33-model_00-model_states.pt. 32: [2022-11-26 09:32:02,647] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_34-model_00-model_states.pt... 0: [2022-11-26 09:32:02,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_14-model_00-model_states.pt. 0: [2022-11-26 09:32:02,695] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_15-model_00-model_states.pt... 32: [2022-11-26 09:32:02,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_34-model_00-model_states.pt. 32: [2022-11-26 09:32:02,867] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_35-model_00-model_states.pt... 0: [2022-11-26 09:32:02,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_15-model_00-model_states.pt. 0: [2022-11-26 09:32:02,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_16-model_00-model_states.pt... 32: [2022-11-26 09:32:03,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_35-model_00-model_states.pt. 32: [2022-11-26 09:32:03,084] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_36-model_00-model_states.pt... 0: [2022-11-26 09:32:03,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_16-model_00-model_states.pt. 0: [2022-11-26 09:32:03,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_17-model_00-model_states.pt... 32: [2022-11-26 09:32:03,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_36-model_00-model_states.pt. 32: [2022-11-26 09:32:03,308] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_37-model_00-model_states.pt... 0: [2022-11-26 09:32:03,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_17-model_00-model_states.pt. 0: [2022-11-26 09:32:03,352] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_18-model_00-model_states.pt... 32: [2022-11-26 09:32:03,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_37-model_00-model_states.pt. 32: [2022-11-26 09:32:03,526] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_38-model_00-model_states.pt... 0: [2022-11-26 09:32:03,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_18-model_00-model_states.pt. 0: [2022-11-26 09:32:03,570] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_19-model_00-model_states.pt... 32: [2022-11-26 09:32:03,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_38-model_00-model_states.pt. 32: [2022-11-26 09:32:03,748] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_40-model_00-model_states.pt... 32: [2022-11-26 09:32:03,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_40-model_00-model_states.pt. 32: [2022-11-26 09:32:03,751] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/mp_rank_01_model_states.pt... 32: [2022-11-26 09:32:03,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/mp_rank_01_model_states.pt. 0: [2022-11-26 09:32:03,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_19-model_00-model_states.pt. 0: [2022-11-26 09:32:03,783] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/layer_20-model_00-model_states.pt... 0: [2022-11-26 09:32:04,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/layer_20-model_00-model_states.pt. 0: [2022-11-26 09:32:04,001] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_3b9/global_step24424/mp_rank_00_model_states.pt 0: [2022-11-26 09:32:04,001] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/mp_rank_00_model_states.pt... 0: [2022-11-26 09:32:04,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/mp_rank_00_model_states.pt. 59: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt... 59: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt... 59: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt... 59: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt... 59: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt... 59: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt... 52: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt... 52: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt... 52: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt... 52: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt... 56: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt... 56: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt... 56: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt... 56: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt... 56: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt... 58: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt... 58: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt... 58: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt... 58: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt... 58: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt... 62: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt... 62: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt... 62: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt... 62: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt... 62: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt... 62: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt... 33: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 33: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 33: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 33: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 33: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 33: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 40: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 40: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 40: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 40: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 40: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 40: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 32: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 46: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 46: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 46: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 46: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 36: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 36: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 36: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 36: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 36: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 36: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 48: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt... 37: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 37: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 37: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 37: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 39: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 39: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 39: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 39: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 39: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 39: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 50: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt... 38: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 38: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 38: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 38: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 38: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 38: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 51: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt... 51: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt... 51: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt... 51: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt... 51: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt... 51: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt... 55: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt... 55: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt... 55: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt... 55: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt... 53: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt... 53: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt... 53: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt... 53: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt... 53: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt... 53: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt... 57: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt... 57: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt... 61: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt... 61: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt... 61: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt... 61: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt... 59: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt... 63: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt... 63: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt... 63: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt... 63: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt... 63: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt... 63: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt... 52: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt... 52: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt... 52: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt... 52: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt... 54: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt... 54: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt... 56: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt... 58: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt... 58: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt... 58: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt... 62: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt... 62: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt... 60: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt... 60: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt... 60: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt... 60: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt... 60: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt... 60: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt... 33: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 35: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 35: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 35: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 35: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 35: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 34: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 34: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 34: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 34: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 34: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 34: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 40: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 40: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 44: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 44: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 44: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 44: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 44: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 44: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 32: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 32: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 42: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 42: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 42: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 42: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 42: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 42: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 46: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 46: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 46: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 36: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 36: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 48: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt... 48: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt... 48: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt... 48: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt... 41: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 41: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 41: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 37: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 37: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 37: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 47: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 47: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 47: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 47: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 47: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 47: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 49: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt... 49: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt... 49: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt... 49: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt... 49: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt... 43: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 43: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 43: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 43: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 43: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 43: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 45: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 45: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 45: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 45: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 45: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 45: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 39: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 39: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 50: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt... 50: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt... 50: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt... 50: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt... 50: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt... 50: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt... 38: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 51: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt... 51: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt... 55: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt... 55: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt... 55: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt... 55: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt... 53: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt... 57: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt... 57: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt... 57: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt... 57: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt... 57: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt... 61: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt... 61: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt... 61: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt... 59: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt... 63: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt... 19: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt... 19: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt... 19: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt... 19: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt... 19: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt... 21: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt... 21: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt... 21: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt... 21: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt... 15: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 15: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 15: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 15: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 54: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt... 54: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt... 54: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt... 54: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt... 56: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt... 16: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt... 16: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt... 16: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt... 16: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt... 16: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt... 60: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt... 60: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt... 2: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 2: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 2: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 4: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 4: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 14: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 14: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 18: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt... 18: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt... 18: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt... 18: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt... 18: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt... 10: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 8: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 26: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt... 26: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt... 11: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 11: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 11: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 17: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt... 17: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt... 17: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt... 17: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt... 9: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 9: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 13: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 13: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 13: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 13: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 13: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 23: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt... 25: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt... 25: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt... 25: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt... 27: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt... 27: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt... 29: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt... 31: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt... 31: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt... 5: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 33: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 1: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 1: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 7: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 7: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 35: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 35: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 35: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 22: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt... 12: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 12: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 12: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 12: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 30: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt... 30: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt... 24: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt... 24: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt... 24: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt... 28: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt... 28: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt... 28: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt... 28: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt... 20: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt... 20: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt... 20: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt... 20: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt... 34: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 34: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 44: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 6: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 6: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 32: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 32: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 32: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 32: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 42: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 46: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 48: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt... 48: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt... 48: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt... 41: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 41: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 41: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 41: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 37: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 47: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 47: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 49: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt... 49: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt... 43: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 45: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 45: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 50: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt... 38: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 53: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt... 57: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt... 61: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt... 63: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt... 19: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt... 21: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt... 21: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt... 15: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 15: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 15: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 54: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt... 54: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt... 56: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt... 16: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt... 16: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt... 2: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 2: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 4: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 4: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 14: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 14: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 14: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 14: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 18: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt... 10: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 10: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 10: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 10: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 8: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 26: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt... 26: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt... 26: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt... 26: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt... 11: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 17: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt... 9: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 9: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 9: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 13: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 13: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 23: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt... 23: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt... 23: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt... 25: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt... 25: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt... 27: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt... 29: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt... 29: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt... 31: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt... 31: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt... 31: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt... 5: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 5: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 5: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 5: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 5: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 1: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 1: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 1: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 1: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 7: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 22: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt... 22: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt... 22: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt... 12: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 12: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 12: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 12: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 30: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt... 30: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt... 24: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt... 24: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt... 28: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt... 28: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt... 28: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt... 28: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt... 20: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt... 20: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt... 20: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt... 44: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 6: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 6: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 32: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 42: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 41: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 49: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt... 43: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 19: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt... 21: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt... 21: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt... 15: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 16: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt... 2: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 4: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 4: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 4: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 14: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 18: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt... 10: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 10: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 8: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 8: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 26: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt... 11: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 17: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt... 9: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 9: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 13: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 23: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt... 25: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt... 27: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt... 27: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt... 29: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt... 31: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt... 5: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 1: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 7: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 22: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt... 30: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt... 24: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt... 24: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt... 20: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt... 6: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 19: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt... 2: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 4: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 14: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 18: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt... 10: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 8: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 26: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt... 11: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 11: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 17: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt... 9: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 23: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt... 25: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt... 27: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt... 29: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt... 31: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt... 31: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt... 5: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 1: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 7: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 22: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt... 30: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt... 24: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt... 6: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 6: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 2: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 8: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 11: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 17: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt... 23: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt... 25: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt... 27: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt... 29: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt... 7: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 7: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 22: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt... 30: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt... 6: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 8: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 0: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 23: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt... 27: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt... 29: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt... 29: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt... 7: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 22: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt... 30: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt... 8: [2022-11-26 09:32:04,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_3b9/global_step24424/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 3: [2022-11-26 09:32:04,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 3: [2022-11-26 09:32:04,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 3: [2022-11-26 09:32:04,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 17: [2022-11-26 09:32:04,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt. 17: [2022-11-26 09:32:04,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_138_mp_rank_00_optim_states.pt 17: [2022-11-26 09:32:04,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 23: [2022-11-26 09:32:04,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:32:04,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_187_mp_rank_00_optim_states.pt 23: [2022-11-26 09:32:04,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 7: [2022-11-26 09:32:04,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 7: [2022-11-26 09:32:04,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 15: [2022-11-26 09:32:04,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 7: [2022-11-26 09:32:04,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 15: [2022-11-26 09:32:04,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 15: [2022-11-26 09:32:04,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 41: [2022-11-26 09:32:04,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 41: [2022-11-26 09:32:04,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 41: [2022-11-26 09:32:04,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 37: [2022-11-26 09:32:04,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 37: [2022-11-26 09:32:04,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 37: [2022-11-26 09:32:04,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 21: [2022-11-26 09:32:04,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt. 21: [2022-11-26 09:32:04,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_168_mp_rank_00_optim_states.pt 21: [2022-11-26 09:32:04,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 14: [2022-11-26 09:32:04,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 14: [2022-11-26 09:32:04,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 14: [2022-11-26 09:32:04,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 20: [2022-11-26 09:32:04,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:32:04,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_161_mp_rank_00_optim_states.pt 20: [2022-11-26 09:32:04,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 33: [2022-11-26 09:32:04,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 11: [2022-11-26 09:32:04,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 2: [2022-11-26 09:32:04,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 11: [2022-11-26 09:32:04,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 2: [2022-11-26 09:32:04,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 11: [2022-11-26 09:32:04,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 33: [2022-11-26 09:32:04,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 2: [2022-11-26 09:32:04,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 33: [2022-11-26 09:32:04,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 56: [2022-11-26 09:32:04,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt. 51: [2022-11-26 09:32:04,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt. 56: [2022-11-26 09:32:04,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_196_mp_rank_01_optim_states.pt 51: [2022-11-26 09:32:04,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_152_mp_rank_01_optim_states.pt 56: [2022-11-26 09:32:04,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 62: [2022-11-26 09:32:04,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt. 51: [2022-11-26 09:32:04,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 62: [2022-11-26 09:32:04,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_244_mp_rank_01_optim_states.pt 62: [2022-11-26 09:32:04,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 8: [2022-11-26 09:32:04,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 8: [2022-11-26 09:32:04,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 10: [2022-11-26 09:32:04,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 8: [2022-11-26 09:32:04,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 34: [2022-11-26 09:32:04,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 10: [2022-11-26 09:32:04,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 31: [2022-11-26 09:32:04,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt. 10: [2022-11-26 09:32:04,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 31: [2022-11-26 09:32:04,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_254_mp_rank_00_optim_states.pt 55: [2022-11-26 09:32:04,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt. 31: [2022-11-26 09:32:04,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 55: [2022-11-26 09:32:04,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_185_mp_rank_01_optim_states.pt 55: [2022-11-26 09:32:04,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 46: [2022-11-26 09:32:04,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 46: [2022-11-26 09:32:04,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 46: [2022-11-26 09:32:04,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 26: [2022-11-26 09:32:04,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt. 49: [2022-11-26 09:32:04,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt. 49: [2022-11-26 09:32:04,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_136_mp_rank_01_optim_states.pt 49: [2022-11-26 09:32:04,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 34: [2022-11-26 09:32:04,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 26: [2022-11-26 09:32:04,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_213_mp_rank_00_optim_states.pt 26: [2022-11-26 09:32:04,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 34: [2022-11-26 09:32:04,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 63: [2022-11-26 09:32:04,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt. 63: [2022-11-26 09:32:04,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_253_mp_rank_01_optim_states.pt 63: [2022-11-26 09:32:04,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 35: [2022-11-26 09:32:04,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 35: [2022-11-26 09:32:04,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 35: [2022-11-26 09:32:04,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 4: [2022-11-26 09:32:04,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 4: [2022-11-26 09:32:04,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 4: [2022-11-26 09:32:04,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 18: [2022-11-26 09:32:04,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt. 18: [2022-11-26 09:32:04,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_145_mp_rank_00_optim_states.pt 18: [2022-11-26 09:32:04,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 7: [2022-11-26 09:32:04,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 7: [2022-11-26 09:32:04,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 7: [2022-11-26 09:32:04,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 3: [2022-11-26 09:32:04,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 3: [2022-11-26 09:32:04,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 45: [2022-11-26 09:32:04,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 45: [2022-11-26 09:32:04,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 5: [2022-11-26 09:32:04,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 3: [2022-11-26 09:32:04,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 45: [2022-11-26 09:32:04,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 5: [2022-11-26 09:32:04,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 5: [2022-11-26 09:32:04,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 5: [2022-11-26 09:32:04,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 5: [2022-11-26 09:32:04,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 5: [2022-11-26 09:32:04,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 45: [2022-11-26 09:32:04,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 45: [2022-11-26 09:32:04,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 45: [2022-11-26 09:32:04,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 30: [2022-11-26 09:32:04,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt. 13: [2022-11-26 09:32:04,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 30: [2022-11-26 09:32:04,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_241_mp_rank_00_optim_states.pt 13: [2022-11-26 09:32:04,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 13: [2022-11-26 09:32:04,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 30: [2022-11-26 09:32:04,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 23: [2022-11-26 09:32:04,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:32:04,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_186_mp_rank_00_optim_states.pt 23: [2022-11-26 09:32:04,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 14: [2022-11-26 09:32:04,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 1: [2022-11-26 09:32:04,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 14: [2022-11-26 09:32:04,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 1: [2022-11-26 09:32:04,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 14: [2022-11-26 09:32:04,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 1: [2022-11-26 09:32:04,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 24: [2022-11-26 09:32:04,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt. 24: [2022-11-26 09:32:04,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_198_mp_rank_00_optim_states.pt 24: [2022-11-26 09:32:04,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 15: [2022-11-26 09:32:04,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 15: [2022-11-26 09:32:04,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 15: [2022-11-26 09:32:04,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 57: [2022-11-26 09:32:04,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt. 0: [2022-11-26 09:32:04,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 57: [2022-11-26 09:32:04,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_200_mp_rank_01_optim_states.pt 0: [2022-11-26 09:32:04,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 57: [2022-11-26 09:32:04,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 0: [2022-11-26 09:32:04,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 39: [2022-11-26 09:32:04,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 39: [2022-11-26 09:32:04,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-26 09:32:04,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 24: [2022-11-26 09:32:04,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt. 24: [2022-11-26 09:32:04,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_199_mp_rank_00_optim_states.pt 24: [2022-11-26 09:32:04,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 1: [2022-11-26 09:32:04,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 1: [2022-11-26 09:32:04,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 1: [2022-11-26 09:32:04,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 25: [2022-11-26 09:32:04,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt. 25: [2022-11-26 09:32:04,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_206_mp_rank_00_optim_states.pt 25: [2022-11-26 09:32:04,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 56: [2022-11-26 09:32:04,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt. 56: [2022-11-26 09:32:04,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_192_mp_rank_01_optim_states.pt 56: [2022-11-26 09:32:04,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 39: [2022-11-26 09:32:04,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 39: [2022-11-26 09:32:04,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 37: [2022-11-26 09:32:04,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 39: [2022-11-26 09:32:04,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 37: [2022-11-26 09:32:04,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 37: [2022-11-26 09:32:04,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 41: [2022-11-26 09:32:04,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 41: [2022-11-26 09:32:04,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 41: [2022-11-26 09:32:04,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 17: [2022-11-26 09:32:04,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt. 17: [2022-11-26 09:32:04,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_142_mp_rank_00_optim_states.pt 17: [2022-11-26 09:32:04,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 21: [2022-11-26 09:32:04,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt. 21: [2022-11-26 09:32:04,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_170_mp_rank_00_optim_states.pt 21: [2022-11-26 09:32:04,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 34: [2022-11-26 09:32:04,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 6: [2022-11-26 09:32:04,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 34: [2022-11-26 09:32:04,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 5: [2022-11-26 09:32:04,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 5: [2022-11-26 09:32:04,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 6: [2022-11-26 09:32:04,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 48: [2022-11-26 09:32:04,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt. 5: [2022-11-26 09:32:04,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 34: [2022-11-26 09:32:04,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 6: [2022-11-26 09:32:04,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 48: [2022-11-26 09:32:04,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_129_mp_rank_01_optim_states.pt 48: [2022-11-26 09:32:04,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 53: [2022-11-26 09:32:04,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt. 53: [2022-11-26 09:32:04,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt. 53: [2022-11-26 09:32:04,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_174_mp_rank_01_optim_states.pt 53: [2022-11-26 09:32:04,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_173_mp_rank_01_optim_states.pt 53: [2022-11-26 09:32:04,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 53: [2022-11-26 09:32:04,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 4: [2022-11-26 09:32:04,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 4: [2022-11-26 09:32:04,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 4: [2022-11-26 09:32:04,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 63: [2022-11-26 09:32:04,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt. 63: [2022-11-26 09:32:04,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_250_mp_rank_01_optim_states.pt 63: [2022-11-26 09:32:04,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 26: [2022-11-26 09:32:04,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt. 26: [2022-11-26 09:32:04,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_208_mp_rank_00_optim_states.pt 26: [2022-11-26 09:32:04,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 36: [2022-11-26 09:32:04,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 3: [2022-11-26 09:32:04,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 36: [2022-11-26 09:32:04,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 36: [2022-11-26 09:32:04,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 36: [2022-11-26 09:32:04,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 36: [2022-11-26 09:32:04,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 3: [2022-11-26 09:32:04,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 36: [2022-11-26 09:32:04,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 48: [2022-11-26 09:32:04,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt. 3: [2022-11-26 09:32:04,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 48: [2022-11-26 09:32:04,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_132_mp_rank_01_optim_states.pt 48: [2022-11-26 09:32:04,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 51: [2022-11-26 09:32:04,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt. 18: [2022-11-26 09:32:04,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt. 18: [2022-11-26 09:32:04,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_151_mp_rank_00_optim_states.pt 51: [2022-11-26 09:32:04,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_157_mp_rank_01_optim_states.pt 18: [2022-11-26 09:32:04,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 51: [2022-11-26 09:32:04,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 46: [2022-11-26 09:32:04,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 14: [2022-11-26 09:32:04,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 46: [2022-11-26 09:32:04,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 14: [2022-11-26 09:32:04,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 35: [2022-11-26 09:32:04,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 46: [2022-11-26 09:32:04,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 14: [2022-11-26 09:32:04,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 35: [2022-11-26 09:32:04,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 35: [2022-11-26 09:32:04,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 2: [2022-11-26 09:32:04,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 2: [2022-11-26 09:32:04,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 55: [2022-11-26 09:32:04,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt. 2: [2022-11-26 09:32:04,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 55: [2022-11-26 09:32:04,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_188_mp_rank_01_optim_states.pt 55: [2022-11-26 09:32:04,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 49: [2022-11-26 09:32:04,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt. 49: [2022-11-26 09:32:04,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_143_mp_rank_01_optim_states.pt 49: [2022-11-26 09:32:04,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 30: [2022-11-26 09:32:04,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt. 62: [2022-11-26 09:32:04,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt. 13: [2022-11-26 09:32:04,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 62: [2022-11-26 09:32:04,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_243_mp_rank_01_optim_states.pt 13: [2022-11-26 09:32:04,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 62: [2022-11-26 09:32:04,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 13: [2022-11-26 09:32:04,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 8: [2022-11-26 09:32:04,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 30: [2022-11-26 09:32:04,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_243_mp_rank_00_optim_states.pt 8: [2022-11-26 09:32:04,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 8: [2022-11-26 09:32:04,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 30: [2022-11-26 09:32:04,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 13: [2022-11-26 09:32:04,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 13: [2022-11-26 09:32:04,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 13: [2022-11-26 09:32:04,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 57: [2022-11-26 09:32:04,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt. 57: [2022-11-26 09:32:04,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_202_mp_rank_01_optim_states.pt 57: [2022-11-26 09:32:04,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 11: [2022-11-26 09:32:04,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 30: [2022-11-26 09:32:04,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt. 11: [2022-11-26 09:32:04,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 30: [2022-11-26 09:32:04,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_244_mp_rank_00_optim_states.pt 11: [2022-11-26 09:32:04,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 30: [2022-11-26 09:32:04,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 32: [2022-11-26 09:32:04,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 24: [2022-11-26 09:32:04,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt. 32: [2022-11-26 09:32:04,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 24: [2022-11-26 09:32:04,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_196_mp_rank_00_optim_states.pt 32: [2022-11-26 09:32:04,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 24: [2022-11-26 09:32:04,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 11: [2022-11-26 09:32:04,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 11: [2022-11-26 09:32:04,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 37: [2022-11-26 09:32:04,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 11: [2022-11-26 09:32:04,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 37: [2022-11-26 09:32:04,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 37: [2022-11-26 09:32:04,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 25: [2022-11-26 09:32:04,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt. 25: [2022-11-26 09:32:04,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_207_mp_rank_00_optim_states.pt 25: [2022-11-26 09:32:04,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 23: [2022-11-26 09:32:04,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:32:04,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_184_mp_rank_00_optim_states.pt 31: [2022-11-26 09:32:04,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:32:04,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 31: [2022-11-26 09:32:04,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_251_mp_rank_00_optim_states.pt 47: [2022-11-26 09:32:04,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 31: [2022-11-26 09:32:04,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 47: [2022-11-26 09:32:04,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 47: [2022-11-26 09:32:04,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 49: [2022-11-26 09:32:04,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt. 49: [2022-11-26 09:32:04,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_137_mp_rank_01_optim_states.pt 49: [2022-11-26 09:32:04,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 31: [2022-11-26 09:32:04,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt. 31: [2022-11-26 09:32:04,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_249_mp_rank_00_optim_states.pt 31: [2022-11-26 09:32:04,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 34: [2022-11-26 09:32:04,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 33: [2022-11-26 09:32:04,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 33: [2022-11-26 09:32:04,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 33: [2022-11-26 09:32:04,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 12: [2022-11-26 09:32:04,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 52: [2022-11-26 09:32:04,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt. 34: [2022-11-26 09:32:04,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 52: [2022-11-26 09:32:04,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_160_mp_rank_01_optim_states.pt 52: [2022-11-26 09:32:04,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 34: [2022-11-26 09:32:04,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 41: [2022-11-26 09:32:04,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 10: [2022-11-26 09:32:04,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 41: [2022-11-26 09:32:04,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 10: [2022-11-26 09:32:04,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 41: [2022-11-26 09:32:04,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 10: [2022-11-26 09:32:04,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 12: [2022-11-26 09:32:04,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 1: [2022-11-26 09:32:04,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 12: [2022-11-26 09:32:04,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 1: [2022-11-26 09:32:04,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 12: [2022-11-26 09:32:04,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 4: [2022-11-26 09:32:04,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 1: [2022-11-26 09:32:04,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 12: [2022-11-26 09:32:04,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 4: [2022-11-26 09:32:04,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 12: [2022-11-26 09:32:04,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 4: [2022-11-26 09:32:04,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 12: [2022-11-26 09:32:04,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 12: [2022-11-26 09:32:04,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-26 09:32:04,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 60: [2022-11-26 09:32:04,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt. 60: [2022-11-26 09:32:04,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_227_mp_rank_01_optim_states.pt 39: [2022-11-26 09:32:04,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 60: [2022-11-26 09:32:04,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 39: [2022-11-26 09:32:04,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 60: [2022-11-26 09:32:04,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt. 39: [2022-11-26 09:32:04,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 60: [2022-11-26 09:32:04,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_228_mp_rank_01_optim_states.pt 60: [2022-11-26 09:32:04,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 10: [2022-11-26 09:32:04,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 10: [2022-11-26 09:32:04,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 10: [2022-11-26 09:32:04,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 52: [2022-11-26 09:32:04,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt. 52: [2022-11-26 09:32:04,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_165_mp_rank_01_optim_states.pt 2: [2022-11-26 09:32:04,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 52: [2022-11-26 09:32:04,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 2: [2022-11-26 09:32:04,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 2: [2022-11-26 09:32:04,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 56: [2022-11-26 09:32:04,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt. 25: [2022-11-26 09:32:04,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt. 21: [2022-11-26 09:32:04,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt. 21: [2022-11-26 09:32:04,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_173_mp_rank_00_optim_states.pt 62: [2022-11-26 09:32:04,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt. 25: [2022-11-26 09:32:04,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_205_mp_rank_00_optim_states.pt 21: [2022-11-26 09:32:04,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 25: [2022-11-26 09:32:04,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 62: [2022-11-26 09:32:04,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_247_mp_rank_01_optim_states.pt 14: [2022-11-26 09:32:04,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 56: [2022-11-26 09:32:04,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_194_mp_rank_01_optim_states.pt 62: [2022-11-26 09:32:04,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 14: [2022-11-26 09:32:04,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 14: [2022-11-26 09:32:04,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 56: [2022-11-26 09:32:04,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 53: [2022-11-26 09:32:04,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt. 63: [2022-11-26 09:32:04,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt. 53: [2022-11-26 09:32:04,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_168_mp_rank_01_optim_states.pt 63: [2022-11-26 09:32:04,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_251_mp_rank_01_optim_states.pt 63: [2022-11-26 09:32:04,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 53: [2022-11-26 09:32:04,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 8: [2022-11-26 09:32:04,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 8: [2022-11-26 09:32:04,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 8: [2022-11-26 09:32:04,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 6: [2022-11-26 09:32:04,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 28: [2022-11-26 09:32:04,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt. 28: [2022-11-26 09:32:04,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt. 28: [2022-11-26 09:32:04,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:32:04,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:32:04,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_164_mp_rank_00_optim_states.pt 6: [2022-11-26 09:32:04,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 20: [2022-11-26 09:32:04,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 6: [2022-11-26 09:32:04,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 32: [2022-11-26 09:32:04,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 32: [2022-11-26 09:32:04,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 32: [2022-11-26 09:32:04,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 30: [2022-11-26 09:32:04,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt. 52: [2022-11-26 09:32:04,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt. 52: [2022-11-26 09:32:04,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_162_mp_rank_01_optim_states.pt 52: [2022-11-26 09:32:04,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 30: [2022-11-26 09:32:04,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_242_mp_rank_00_optim_states.pt 30: [2022-11-26 09:32:04,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 28: [2022-11-26 09:32:04,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_228_mp_rank_00_optim_states.pt 28: [2022-11-26 09:32:04,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_226_mp_rank_00_optim_states.pt 28: [2022-11-26 09:32:04,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_231_mp_rank_00_optim_states.pt 20: [2022-11-26 09:32:04,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt. 28: [2022-11-26 09:32:04,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 28: [2022-11-26 09:32:04,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 20: [2022-11-26 09:32:04,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_165_mp_rank_00_optim_states.pt 28: [2022-11-26 09:32:04,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 20: [2022-11-26 09:32:04,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 6: [2022-11-26 09:32:04,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 6: [2022-11-26 09:32:04,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 6: [2022-11-26 09:32:04,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 15: [2022-11-26 09:32:04,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 15: [2022-11-26 09:32:04,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 15: [2022-11-26 09:32:04,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 35: [2022-11-26 09:32:04,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 35: [2022-11-26 09:32:04,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 35: [2022-11-26 09:32:04,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 18: [2022-11-26 09:32:04,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt. 18: [2022-11-26 09:32:04,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_149_mp_rank_00_optim_states.pt 18: [2022-11-26 09:32:04,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 7: [2022-11-26 09:32:04,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 3: [2022-11-26 09:32:04,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 7: [2022-11-26 09:32:04,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 7: [2022-11-26 09:32:04,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 46: [2022-11-26 09:32:04,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 3: [2022-11-26 09:32:04,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 46: [2022-11-26 09:32:04,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 3: [2022-11-26 09:32:04,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 46: [2022-11-26 09:32:04,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 5: [2022-11-26 09:32:04,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 5: [2022-11-26 09:32:04,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 42: [2022-11-26 09:32:04,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 5: [2022-11-26 09:32:04,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 42: [2022-11-26 09:32:04,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 42: [2022-11-26 09:32:04,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 17: [2022-11-26 09:32:04,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt. 17: [2022-11-26 09:32:04,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_136_mp_rank_00_optim_states.pt 17: [2022-11-26 09:32:04,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 26: [2022-11-26 09:32:04,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt. 26: [2022-11-26 09:32:04,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_209_mp_rank_00_optim_states.pt 26: [2022-11-26 09:32:04,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 45: [2022-11-26 09:32:04,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 45: [2022-11-26 09:32:04,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 45: [2022-11-26 09:32:04,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 55: [2022-11-26 09:32:04,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt. 55: [2022-11-26 09:32:04,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_191_mp_rank_01_optim_states.pt 55: [2022-11-26 09:32:04,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 51: [2022-11-26 09:32:04,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt. 42: [2022-11-26 09:32:04,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 51: [2022-11-26 09:32:04,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_153_mp_rank_01_optim_states.pt 51: [2022-11-26 09:32:04,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 42: [2022-11-26 09:32:04,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 42: [2022-11-26 09:32:04,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 42: [2022-11-26 09:32:04,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 42: [2022-11-26 09:32:04,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 42: [2022-11-26 09:32:04,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 15: [2022-11-26 09:32:04,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 15: [2022-11-26 09:32:04,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 15: [2022-11-26 09:32:04,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 32: [2022-11-26 09:32:04,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 32: [2022-11-26 09:32:04,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 32: [2022-11-26 09:32:04,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 58: [2022-11-26 09:32:04,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt. 58: [2022-11-26 09:32:04,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt. 58: [2022-11-26 09:32:04,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt. 58: [2022-11-26 09:32:04,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_215_mp_rank_01_optim_states.pt 58: [2022-11-26 09:32:04,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_209_mp_rank_01_optim_states.pt 58: [2022-11-26 09:32:04,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_214_mp_rank_01_optim_states.pt 58: [2022-11-26 09:32:04,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 58: [2022-11-26 09:32:04,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 58: [2022-11-26 09:32:04,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 47: [2022-11-26 09:32:04,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 47: [2022-11-26 09:32:04,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 47: [2022-11-26 09:32:04,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 7: [2022-11-26 09:32:04,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 7: [2022-11-26 09:32:04,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 7: [2022-11-26 09:32:04,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 22: [2022-11-26 09:32:04,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt. 22: [2022-11-26 09:32:04,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_178_mp_rank_00_optim_states.pt 22: [2022-11-26 09:32:04,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt. 22: [2022-11-26 09:32:04,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 22: [2022-11-26 09:32:04,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_180_mp_rank_00_optim_states.pt 22: [2022-11-26 09:32:04,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 17: [2022-11-26 09:32:04,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt. 22: [2022-11-26 09:32:04,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt. 17: [2022-11-26 09:32:04,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_143_mp_rank_00_optim_states.pt 22: [2022-11-26 09:32:04,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_176_mp_rank_00_optim_states.pt 17: [2022-11-26 09:32:04,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 22: [2022-11-26 09:32:04,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 0: [2022-11-26 09:32:04,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-26 09:32:04,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-26 09:32:04,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 6: [2022-11-26 09:32:04,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 6: [2022-11-26 09:32:04,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 6: [2022-11-26 09:32:04,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 19: [2022-11-26 09:32:04,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt. 19: [2022-11-26 09:32:04,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt. 19: [2022-11-26 09:32:04,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt. 19: [2022-11-26 09:32:04,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_154_mp_rank_00_optim_states.pt 19: [2022-11-26 09:32:04,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_158_mp_rank_00_optim_states.pt 19: [2022-11-26 09:32:04,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_152_mp_rank_00_optim_states.pt 19: [2022-11-26 09:32:04,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 19: [2022-11-26 09:32:04,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 19: [2022-11-26 09:32:04,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 0: [2022-11-26 09:32:04,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 37: [2022-11-26 09:32:04,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 0: [2022-11-26 09:32:04,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 33: [2022-11-26 09:32:04,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 37: [2022-11-26 09:32:04,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 0: [2022-11-26 09:32:04,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 33: [2022-11-26 09:32:04,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 37: [2022-11-26 09:32:04,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 0: [2022-11-26 09:32:04,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 33: [2022-11-26 09:32:04,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 0: [2022-11-26 09:32:04,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-26 09:32:04,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 41: [2022-11-26 09:32:04,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 41: [2022-11-26 09:32:04,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 41: [2022-11-26 09:32:04,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 1: [2022-11-26 09:32:04,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 1: [2022-11-26 09:32:04,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 48: [2022-11-26 09:32:04,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt. 48: [2022-11-26 09:32:04,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_128_mp_rank_01_optim_states.pt 48: [2022-11-26 09:32:04,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 23: [2022-11-26 09:32:04,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:32:04,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_191_mp_rank_00_optim_states.pt 23: [2022-11-26 09:32:04,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 50: [2022-11-26 09:32:04,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt. 50: [2022-11-26 09:32:04,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_147_mp_rank_01_optim_states.pt 50: [2022-11-26 09:32:04,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 60: [2022-11-26 09:32:04,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt. 60: [2022-11-26 09:32:04,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_225_mp_rank_01_optim_states.pt 57: [2022-11-26 09:32:04,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt. 21: [2022-11-26 09:32:04,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt. 60: [2022-11-26 09:32:04,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 21: [2022-11-26 09:32:04,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_175_mp_rank_00_optim_states.pt 21: [2022-11-26 09:32:04,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 57: [2022-11-26 09:32:04,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_206_mp_rank_01_optim_states.pt 57: [2022-11-26 09:32:04,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 50: [2022-11-26 09:32:04,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt. 50: [2022-11-26 09:32:04,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt. 61: [2022-11-26 09:32:04,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt. 61: [2022-11-26 09:32:04,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt. 61: [2022-11-26 09:32:04,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt. 50: [2022-11-26 09:32:04,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_146_mp_rank_01_optim_states.pt 50: [2022-11-26 09:32:04,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_149_mp_rank_01_optim_states.pt 61: [2022-11-26 09:32:04,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt. 59: [2022-11-26 09:32:04,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt. 50: [2022-11-26 09:32:04,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 50: [2022-11-26 09:32:04,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 61: [2022-11-26 09:32:04,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_236_mp_rank_01_optim_states.pt 61: [2022-11-26 09:32:04,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_234_mp_rank_01_optim_states.pt 59: [2022-11-26 09:32:04,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_217_mp_rank_01_optim_states.pt 61: [2022-11-26 09:32:04,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_239_mp_rank_01_optim_states.pt 59: [2022-11-26 09:32:04,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 61: [2022-11-26 09:32:04,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_232_mp_rank_01_optim_states.pt 61: [2022-11-26 09:32:04,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 61: [2022-11-26 09:32:04,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 61: [2022-11-26 09:32:04,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 18: [2022-11-26 09:32:04,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt. 29: [2022-11-26 09:32:04,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt. 61: [2022-11-26 09:32:04,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 18: [2022-11-26 09:32:04,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_150_mp_rank_00_optim_states.pt 29: [2022-11-26 09:32:04,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_238_mp_rank_00_optim_states.pt 18: [2022-11-26 09:32:04,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 29: [2022-11-26 09:32:04,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 1: [2022-11-26 09:32:04,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 13: [2022-11-26 09:32:04,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 29: [2022-11-26 09:32:04,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt. 13: [2022-11-26 09:32:04,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 29: [2022-11-26 09:32:04,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt. 54: [2022-11-26 09:32:04,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt. 54: [2022-11-26 09:32:04,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt. 54: [2022-11-26 09:32:04,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt. 13: [2022-11-26 09:32:04,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 29: [2022-11-26 09:32:04,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_234_mp_rank_00_optim_states.pt 29: [2022-11-26 09:32:04,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_237_mp_rank_00_optim_states.pt 29: [2022-11-26 09:32:04,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 29: [2022-11-26 09:32:04,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 54: [2022-11-26 09:32:04,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_179_mp_rank_01_optim_states.pt 54: [2022-11-26 09:32:04,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_176_mp_rank_01_optim_states.pt 54: [2022-11-26 09:32:04,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_177_mp_rank_01_optim_states.pt 54: [2022-11-26 09:32:04,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 54: [2022-11-26 09:32:04,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 54: [2022-11-26 09:32:04,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 59: [2022-11-26 09:32:04,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt. 59: [2022-11-26 09:32:04,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_221_mp_rank_01_optim_states.pt 59: [2022-11-26 09:32:04,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 39: [2022-11-26 09:32:04,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 39: [2022-11-26 09:32:04,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 39: [2022-11-26 09:32:04,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 42: [2022-11-26 09:32:04,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 42: [2022-11-26 09:32:04,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 42: [2022-11-26 09:32:04,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 2: [2022-11-26 09:32:04,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 2: [2022-11-26 09:32:04,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 2: [2022-11-26 09:32:04,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 19: [2022-11-26 09:32:04,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt. 19: [2022-11-26 09:32:04,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_156_mp_rank_00_optim_states.pt 19: [2022-11-26 09:32:04,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 12: [2022-11-26 09:32:04,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 12: [2022-11-26 09:32:04,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 12: [2022-11-26 09:32:04,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 59: [2022-11-26 09:32:04,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt. 59: [2022-11-26 09:32:04,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_220_mp_rank_01_optim_states.pt 59: [2022-11-26 09:32:04,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 10: [2022-11-26 09:32:04,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 10: [2022-11-26 09:32:04,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 10: [2022-11-26 09:32:04,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 60: [2022-11-26 09:32:04,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt. 60: [2022-11-26 09:32:04,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_229_mp_rank_01_optim_states.pt 60: [2022-11-26 09:32:04,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 29: [2022-11-26 09:32:04,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt. 38: [2022-11-26 09:32:04,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 29: [2022-11-26 09:32:04,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_232_mp_rank_00_optim_states.pt 38: [2022-11-26 09:32:04,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 38: [2022-11-26 09:32:04,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 29: [2022-11-26 09:32:04,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 38: [2022-11-26 09:32:04,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 38: [2022-11-26 09:32:04,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 38: [2022-11-26 09:32:04,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 38: [2022-11-26 09:32:04,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 38: [2022-11-26 09:32:04,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 38: [2022-11-26 09:32:04,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 59: [2022-11-26 09:32:04,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt. 59: [2022-11-26 09:32:04,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_218_mp_rank_01_optim_states.pt 59: [2022-11-26 09:32:04,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 49: [2022-11-26 09:32:04,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt. 49: [2022-11-26 09:32:04,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_142_mp_rank_01_optim_states.pt 49: [2022-11-26 09:32:04,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 52: [2022-11-26 09:32:04,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt. 52: [2022-11-26 09:32:04,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_166_mp_rank_01_optim_states.pt 52: [2022-11-26 09:32:04,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 56: [2022-11-26 09:32:04,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt. 62: [2022-11-26 09:32:04,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt. 62: [2022-11-26 09:32:04,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_240_mp_rank_01_optim_states.pt 62: [2022-11-26 09:32:04,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 56: [2022-11-26 09:32:04,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_195_mp_rank_01_optim_states.pt 56: [2022-11-26 09:32:04,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 4: [2022-11-26 09:32:04,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 4: [2022-11-26 09:32:04,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 4: [2022-11-26 09:32:04,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 58: [2022-11-26 09:32:04,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt. 58: [2022-11-26 09:32:04,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_211_mp_rank_01_optim_states.pt 58: [2022-11-26 09:32:04,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 43: [2022-11-26 09:32:04,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 43: [2022-11-26 09:32:04,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 43: [2022-11-26 09:32:04,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 43: [2022-11-26 09:32:04,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 43: [2022-11-26 09:32:04,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 43: [2022-11-26 09:32:04,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 43: [2022-11-26 09:32:04,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 43: [2022-11-26 09:32:04,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 43: [2022-11-26 09:32:04,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 43: [2022-11-26 09:32:04,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 43: [2022-11-26 09:32:04,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 43: [2022-11-26 09:32:04,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 34: [2022-11-26 09:32:04,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 25: [2022-11-26 09:32:04,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt. 34: [2022-11-26 09:32:04,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 25: [2022-11-26 09:32:04,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_203_mp_rank_00_optim_states.pt 34: [2022-11-26 09:32:04,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 25: [2022-11-26 09:32:04,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 11: [2022-11-26 09:32:04,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 11: [2022-11-26 09:32:04,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 11: [2022-11-26 09:32:04,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 28: [2022-11-26 09:32:04,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt. 28: [2022-11-26 09:32:04,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_229_mp_rank_00_optim_states.pt 28: [2022-11-26 09:32:04,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 47: [2022-11-26 09:32:04,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 47: [2022-11-26 09:32:04,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 47: [2022-11-26 09:32:04,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 53: [2022-11-26 09:32:04,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt. 53: [2022-11-26 09:32:04,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_172_mp_rank_01_optim_states.pt 53: [2022-11-26 09:32:04,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 20: [2022-11-26 09:32:04,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:32:04,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_160_mp_rank_00_optim_states.pt 20: [2022-11-26 09:32:04,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 44: [2022-11-26 09:32:04,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 44: [2022-11-26 09:32:04,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 44: [2022-11-26 09:32:04,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 44: [2022-11-26 09:32:04,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 44: [2022-11-26 09:32:04,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 44: [2022-11-26 09:32:04,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 44: [2022-11-26 09:32:04,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 44: [2022-11-26 09:32:04,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 44: [2022-11-26 09:32:04,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 44: [2022-11-26 09:32:04,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 44: [2022-11-26 09:32:04,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 44: [2022-11-26 09:32:04,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 9: [2022-11-26 09:32:04,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 9: [2022-11-26 09:32:04,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 9: [2022-11-26 09:32:04,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 9: [2022-11-26 09:32:04,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 9: [2022-11-26 09:32:04,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 9: [2022-11-26 09:32:04,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 9: [2022-11-26 09:32:04,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 9: [2022-11-26 09:32:04,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 9: [2022-11-26 09:32:04,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 9: [2022-11-26 09:32:04,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 9: [2022-11-26 09:32:04,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 9: [2022-11-26 09:32:04,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 33: [2022-11-26 09:32:04,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 33: [2022-11-26 09:32:04,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 33: [2022-11-26 09:32:04,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 24: [2022-11-26 09:32:04,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt. 24: [2022-11-26 09:32:04,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_193_mp_rank_00_optim_states.pt 24: [2022-11-26 09:32:04,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 31: [2022-11-26 09:32:04,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt. 31: [2022-11-26 09:32:04,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_255_mp_rank_00_optim_states.pt 31: [2022-11-26 09:32:04,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 55: [2022-11-26 09:32:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt. 55: [2022-11-26 09:32:04,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_187_mp_rank_01_optim_states.pt 55: [2022-11-26 09:32:04,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 54: [2022-11-26 09:32:04,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt. 48: [2022-11-26 09:32:04,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt. 48: [2022-11-26 09:32:04,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_130_mp_rank_01_optim_states.pt 48: [2022-11-26 09:32:04,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 54: [2022-11-26 09:32:04,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_183_mp_rank_01_optim_states.pt 54: [2022-11-26 09:32:04,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 45: [2022-11-26 09:32:04,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 45: [2022-11-26 09:32:04,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 45: [2022-11-26 09:32:04,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 63: [2022-11-26 09:32:04,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt. 63: [2022-11-26 09:32:04,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_255_mp_rank_01_optim_states.pt 63: [2022-11-26 09:32:04,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 22: [2022-11-26 09:32:04,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt. 22: [2022-11-26 09:32:04,369] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_183_mp_rank_00_optim_states.pt 22: [2022-11-26 09:32:04,369] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 57: [2022-11-26 09:32:04,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt. 57: [2022-11-26 09:32:04,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_207_mp_rank_01_optim_states.pt 57: [2022-11-26 09:32:04,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 35: [2022-11-26 09:32:04,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 35: [2022-11-26 09:32:04,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 35: [2022-11-26 09:32:04,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 36: [2022-11-26 09:32:04,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 36: [2022-11-26 09:32:04,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 36: [2022-11-26 09:32:04,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 38: [2022-11-26 09:32:04,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 38: [2022-11-26 09:32:04,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 38: [2022-11-26 09:32:04,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 50: [2022-11-26 09:32:04,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt. 50: [2022-11-26 09:32:04,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_148_mp_rank_01_optim_states.pt 50: [2022-11-26 09:32:04,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 8: [2022-11-26 09:32:04,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 8: [2022-11-26 09:32:04,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 8: [2022-11-26 09:32:04,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 51: [2022-11-26 09:32:04,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt. 51: [2022-11-26 09:32:04,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_158_mp_rank_01_optim_states.pt 51: [2022-11-26 09:32:04,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 26: [2022-11-26 09:32:04,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt. 26: [2022-11-26 09:32:04,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_212_mp_rank_00_optim_states.pt 32: [2022-11-26 09:32:04,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 26: [2022-11-26 09:32:04,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 32: [2022-11-26 09:32:04,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 32: [2022-11-26 09:32:04,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 46: [2022-11-26 09:32:04,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 46: [2022-11-26 09:32:04,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 46: [2022-11-26 09:32:04,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 0: [2022-11-26 09:32:04,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 0: [2022-11-26 09:32:04,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 0: [2022-11-26 09:32:04,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 3: [2022-11-26 09:32:04,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 51: [2022-11-26 09:32:04,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt. 55: [2022-11-26 09:32:04,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt. 53: [2022-11-26 09:32:04,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt. 57: [2022-11-26 09:32:04,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt. 61: [2022-11-26 09:32:04,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt. 59: [2022-11-26 09:32:04,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt. 63: [2022-11-26 09:32:04,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt. 19: [2022-11-26 09:32:04,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt. 21: [2022-11-26 09:32:04,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt. 15: [2022-11-26 09:32:04,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 52: [2022-11-26 09:32:04,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt. 54: [2022-11-26 09:32:04,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt. 56: [2022-11-26 09:32:04,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt. 58: [2022-11-26 09:32:04,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt. 62: [2022-11-26 09:32:04,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt. 60: [2022-11-26 09:32:04,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt. 2: [2022-11-26 09:32:04,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 4: [2022-11-26 09:32:04,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 14: [2022-11-26 09:32:04,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 18: [2022-11-26 09:32:04,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt. 10: [2022-11-26 09:32:04,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 8: [2022-11-26 09:32:04,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 0: [2022-11-26 09:32:04,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 26: [2022-11-26 09:32:04,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt. 11: [2022-11-26 09:32:04,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 17: [2022-11-26 09:32:04,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt. 9: [2022-11-26 09:32:04,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 13: [2022-11-26 09:32:04,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:32:04,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt. 25: [2022-11-26 09:32:04,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt. 29: [2022-11-26 09:32:04,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt. 31: [2022-11-26 09:32:04,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt. 5: [2022-11-26 09:32:04,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 33: [2022-11-26 09:32:04,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 1: [2022-11-26 09:32:04,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 7: [2022-11-26 09:32:04,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 3: [2022-11-26 09:32:04,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 35: [2022-11-26 09:32:04,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 22: [2022-11-26 09:32:04,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt. 12: [2022-11-26 09:32:04,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 30: [2022-11-26 09:32:04,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt. 24: [2022-11-26 09:32:04,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt. 28: [2022-11-26 09:32:04,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:32:04,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt. 34: [2022-11-26 09:32:04,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 44: [2022-11-26 09:32:04,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 6: [2022-11-26 09:32:04,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 32: [2022-11-26 09:32:04,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 42: [2022-11-26 09:32:04,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 46: [2022-11-26 09:32:04,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 36: [2022-11-26 09:32:04,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 48: [2022-11-26 09:32:04,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt. 41: [2022-11-26 09:32:04,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 37: [2022-11-26 09:32:04,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 47: [2022-11-26 09:32:04,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 49: [2022-11-26 09:32:04,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt. 43: [2022-11-26 09:32:04,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 45: [2022-11-26 09:32:04,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 39: [2022-11-26 09:32:04,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 50: [2022-11-26 09:32:04,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt. 38: [2022-11-26 09:32:04,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 51: [2022-11-26 09:32:04,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_154_mp_rank_01_optim_states.pt 55: [2022-11-26 09:32:04,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_190_mp_rank_01_optim_states.pt 53: [2022-11-26 09:32:04,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_171_mp_rank_01_optim_states.pt 57: [2022-11-26 09:32:04,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_203_mp_rank_01_optim_states.pt 61: [2022-11-26 09:32:04,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_235_mp_rank_01_optim_states.pt 59: [2022-11-26 09:32:04,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_222_mp_rank_01_optim_states.pt 63: [2022-11-26 09:32:04,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_252_mp_rank_01_optim_states.pt 19: [2022-11-26 09:32:04,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_153_mp_rank_00_optim_states.pt 21: [2022-11-26 09:32:04,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_171_mp_rank_00_optim_states.pt 15: [2022-11-26 09:32:04,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 52: [2022-11-26 09:32:04,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_163_mp_rank_01_optim_states.pt 54: [2022-11-26 09:32:04,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_178_mp_rank_01_optim_states.pt 56: [2022-11-26 09:32:04,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_197_mp_rank_01_optim_states.pt 58: [2022-11-26 09:32:04,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_213_mp_rank_01_optim_states.pt 62: [2022-11-26 09:32:04,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_245_mp_rank_01_optim_states.pt 60: [2022-11-26 09:32:04,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_224_mp_rank_01_optim_states.pt 2: [2022-11-26 09:32:04,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 4: [2022-11-26 09:32:04,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 14: [2022-11-26 09:32:04,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 18: [2022-11-26 09:32:04,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_148_mp_rank_00_optim_states.pt 10: [2022-11-26 09:32:04,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 8: [2022-11-26 09:32:04,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 0: [2022-11-26 09:32:04,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 26: [2022-11-26 09:32:04,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_211_mp_rank_00_optim_states.pt 11: [2022-11-26 09:32:04,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 17: [2022-11-26 09:32:04,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_139_mp_rank_00_optim_states.pt 9: [2022-11-26 09:32:04,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 13: [2022-11-26 09:32:04,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 23: [2022-11-26 09:32:04,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_190_mp_rank_00_optim_states.pt 25: [2022-11-26 09:32:04,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_201_mp_rank_00_optim_states.pt 29: [2022-11-26 09:32:04,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_235_mp_rank_00_optim_states.pt 31: [2022-11-26 09:32:04,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_248_mp_rank_00_optim_states.pt 5: [2022-11-26 09:32:04,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 33: [2022-11-26 09:32:04,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 1: [2022-11-26 09:32:04,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 7: [2022-11-26 09:32:04,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 3: [2022-11-26 09:32:04,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 35: [2022-11-26 09:32:04,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 22: [2022-11-26 09:32:04,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_181_mp_rank_00_optim_states.pt 12: [2022-11-26 09:32:04,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 30: [2022-11-26 09:32:04,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_246_mp_rank_00_optim_states.pt 24: [2022-11-26 09:32:04,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_192_mp_rank_00_optim_states.pt 28: [2022-11-26 09:32:04,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_224_mp_rank_00_optim_states.pt 20: [2022-11-26 09:32:04,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_163_mp_rank_00_optim_states.pt 34: [2022-11-26 09:32:04,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 44: [2022-11-26 09:32:04,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 6: [2022-11-26 09:32:04,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 32: [2022-11-26 09:32:04,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 42: [2022-11-26 09:32:04,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 46: [2022-11-26 09:32:04,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 36: [2022-11-26 09:32:04,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 48: [2022-11-26 09:32:04,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_131_mp_rank_01_optim_states.pt 41: [2022-11-26 09:32:04,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 37: [2022-11-26 09:32:04,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 47: [2022-11-26 09:32:04,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 49: [2022-11-26 09:32:04,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_138_mp_rank_01_optim_states.pt 43: [2022-11-26 09:32:04,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 45: [2022-11-26 09:32:04,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 39: [2022-11-26 09:32:04,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 50: [2022-11-26 09:32:04,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_145_mp_rank_01_optim_states.pt 38: [2022-11-26 09:32:04,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 51: [2022-11-26 09:32:04,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 55: [2022-11-26 09:32:04,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 53: [2022-11-26 09:32:04,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 57: [2022-11-26 09:32:04,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 61: [2022-11-26 09:32:04,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 59: [2022-11-26 09:32:04,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 63: [2022-11-26 09:32:04,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 19: [2022-11-26 09:32:04,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 21: [2022-11-26 09:32:04,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 15: [2022-11-26 09:32:04,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 52: [2022-11-26 09:32:04,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 54: [2022-11-26 09:32:04,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 56: [2022-11-26 09:32:04,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 58: [2022-11-26 09:32:04,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 62: [2022-11-26 09:32:04,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 60: [2022-11-26 09:32:04,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 2: [2022-11-26 09:32:04,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 4: [2022-11-26 09:32:04,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 14: [2022-11-26 09:32:04,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 18: [2022-11-26 09:32:04,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 10: [2022-11-26 09:32:04,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 8: [2022-11-26 09:32:04,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 0: [2022-11-26 09:32:04,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 26: [2022-11-26 09:32:04,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 11: [2022-11-26 09:32:04,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 17: [2022-11-26 09:32:04,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 9: [2022-11-26 09:32:04,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 13: [2022-11-26 09:32:04,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 23: [2022-11-26 09:32:04,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 25: [2022-11-26 09:32:04,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 29: [2022-11-26 09:32:04,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 31: [2022-11-26 09:32:04,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 5: [2022-11-26 09:32:04,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 33: [2022-11-26 09:32:04,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 1: [2022-11-26 09:32:04,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 7: [2022-11-26 09:32:04,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 3: [2022-11-26 09:32:04,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 35: [2022-11-26 09:32:04,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 22: [2022-11-26 09:32:04,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 12: [2022-11-26 09:32:04,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 30: [2022-11-26 09:32:04,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 24: [2022-11-26 09:32:04,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 28: [2022-11-26 09:32:04,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 20: [2022-11-26 09:32:04,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 34: [2022-11-26 09:32:04,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 44: [2022-11-26 09:32:04,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 6: [2022-11-26 09:32:04,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 32: [2022-11-26 09:32:04,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 42: [2022-11-26 09:32:04,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 46: [2022-11-26 09:32:04,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 36: [2022-11-26 09:32:04,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 48: [2022-11-26 09:32:04,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 41: [2022-11-26 09:32:04,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 37: [2022-11-26 09:32:04,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 47: [2022-11-26 09:32:04,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 49: [2022-11-26 09:32:04,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 43: [2022-11-26 09:32:04,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 45: [2022-11-26 09:32:04,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 39: [2022-11-26 09:32:04,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 50: [2022-11-26 09:32:04,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 38: [2022-11-26 09:32:04,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 51: [2022-11-26 09:32:04,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt. 55: [2022-11-26 09:32:04,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt. 53: [2022-11-26 09:32:04,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt. 57: [2022-11-26 09:32:04,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt. 61: [2022-11-26 09:32:04,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt. 59: [2022-11-26 09:32:04,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt. 63: [2022-11-26 09:32:04,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt. 19: [2022-11-26 09:32:04,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt. 21: [2022-11-26 09:32:04,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt. 15: [2022-11-26 09:32:04,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 52: [2022-11-26 09:32:04,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt. 54: [2022-11-26 09:32:04,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt. 56: [2022-11-26 09:32:04,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt. 58: [2022-11-26 09:32:04,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt. 62: [2022-11-26 09:32:04,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt. 60: [2022-11-26 09:32:04,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt. 2: [2022-11-26 09:32:04,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 4: [2022-11-26 09:32:04,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 14: [2022-11-26 09:32:04,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 18: [2022-11-26 09:32:04,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt. 10: [2022-11-26 09:32:04,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 8: [2022-11-26 09:32:04,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 26: [2022-11-26 09:32:04,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt. 11: [2022-11-26 09:32:04,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 17: [2022-11-26 09:32:04,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt. 9: [2022-11-26 09:32:04,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 13: [2022-11-26 09:32:04,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:32:04,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt. 25: [2022-11-26 09:32:04,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt. 29: [2022-11-26 09:32:04,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt. 31: [2022-11-26 09:32:04,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt. 5: [2022-11-26 09:32:04,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 33: [2022-11-26 09:32:04,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 1: [2022-11-26 09:32:04,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 7: [2022-11-26 09:32:04,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 3: [2022-11-26 09:32:04,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 35: [2022-11-26 09:32:04,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 22: [2022-11-26 09:32:04,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt. 12: [2022-11-26 09:32:04,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 30: [2022-11-26 09:32:04,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt. 24: [2022-11-26 09:32:04,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt. 28: [2022-11-26 09:32:04,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:32:04,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt. 34: [2022-11-26 09:32:04,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 44: [2022-11-26 09:32:04,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 6: [2022-11-26 09:32:04,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 32: [2022-11-26 09:32:04,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 42: [2022-11-26 09:32:04,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 46: [2022-11-26 09:32:04,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 36: [2022-11-26 09:32:04,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 48: [2022-11-26 09:32:04,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt. 41: [2022-11-26 09:32:04,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 37: [2022-11-26 09:32:04,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 47: [2022-11-26 09:32:04,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 49: [2022-11-26 09:32:04,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt. 43: [2022-11-26 09:32:04,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 45: [2022-11-26 09:32:04,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 39: [2022-11-26 09:32:04,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 50: [2022-11-26 09:32:04,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt. 38: [2022-11-26 09:32:04,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 51: [2022-11-26 09:32:04,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_156_mp_rank_01_optim_states.pt 55: [2022-11-26 09:32:04,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_186_mp_rank_01_optim_states.pt 53: [2022-11-26 09:32:04,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_170_mp_rank_01_optim_states.pt 57: [2022-11-26 09:32:04,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_201_mp_rank_01_optim_states.pt 61: [2022-11-26 09:32:04,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_237_mp_rank_01_optim_states.pt 59: [2022-11-26 09:32:04,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_223_mp_rank_01_optim_states.pt 63: [2022-11-26 09:32:04,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_254_mp_rank_01_optim_states.pt 19: [2022-11-26 09:32:04,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_159_mp_rank_00_optim_states.pt 21: [2022-11-26 09:32:04,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_172_mp_rank_00_optim_states.pt 15: [2022-11-26 09:32:04,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 52: [2022-11-26 09:32:04,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_161_mp_rank_01_optim_states.pt 54: [2022-11-26 09:32:04,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_182_mp_rank_01_optim_states.pt 56: [2022-11-26 09:32:04,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_193_mp_rank_01_optim_states.pt 58: [2022-11-26 09:32:04,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_208_mp_rank_01_optim_states.pt 62: [2022-11-26 09:32:04,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_242_mp_rank_01_optim_states.pt 60: [2022-11-26 09:32:04,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_230_mp_rank_01_optim_states.pt 2: [2022-11-26 09:32:04,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 4: [2022-11-26 09:32:04,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 14: [2022-11-26 09:32:04,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 18: [2022-11-26 09:32:04,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_147_mp_rank_00_optim_states.pt 10: [2022-11-26 09:32:04,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 8: [2022-11-26 09:32:04,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 26: [2022-11-26 09:32:04,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_215_mp_rank_00_optim_states.pt 11: [2022-11-26 09:32:04,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 17: [2022-11-26 09:32:04,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_140_mp_rank_00_optim_states.pt 9: [2022-11-26 09:32:04,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 13: [2022-11-26 09:32:04,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 23: [2022-11-26 09:32:04,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_188_mp_rank_00_optim_states.pt 25: [2022-11-26 09:32:04,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_200_mp_rank_00_optim_states.pt 31: [2022-11-26 09:32:04,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_252_mp_rank_00_optim_states.pt 5: [2022-11-26 09:32:04,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 33: [2022-11-26 09:32:04,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 1: [2022-11-26 09:32:04,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 7: [2022-11-26 09:32:04,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 3: [2022-11-26 09:32:04,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 12: [2022-11-26 09:32:04,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 30: [2022-11-26 09:32:04,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_247_mp_rank_00_optim_states.pt 24: [2022-11-26 09:32:04,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_194_mp_rank_00_optim_states.pt 28: [2022-11-26 09:32:04,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_227_mp_rank_00_optim_states.pt 34: [2022-11-26 09:32:04,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 44: [2022-11-26 09:32:04,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 42: [2022-11-26 09:32:04,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 46: [2022-11-26 09:32:04,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 36: [2022-11-26 09:32:04,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 48: [2022-11-26 09:32:04,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_134_mp_rank_01_optim_states.pt 41: [2022-11-26 09:32:04,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 47: [2022-11-26 09:32:04,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 49: [2022-11-26 09:32:04,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_140_mp_rank_01_optim_states.pt 43: [2022-11-26 09:32:04,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 45: [2022-11-26 09:32:04,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 39: [2022-11-26 09:32:04,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 50: [2022-11-26 09:32:04,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_150_mp_rank_01_optim_states.pt 38: [2022-11-26 09:32:04,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 51: [2022-11-26 09:32:04,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 55: [2022-11-26 09:32:04,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 53: [2022-11-26 09:32:04,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 57: [2022-11-26 09:32:04,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 61: [2022-11-26 09:32:04,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 59: [2022-11-26 09:32:04,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 63: [2022-11-26 09:32:04,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 19: [2022-11-26 09:32:04,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 21: [2022-11-26 09:32:04,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 15: [2022-11-26 09:32:04,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 52: [2022-11-26 09:32:04,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 54: [2022-11-26 09:32:04,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 56: [2022-11-26 09:32:04,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 58: [2022-11-26 09:32:04,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 62: [2022-11-26 09:32:04,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 60: [2022-11-26 09:32:04,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 2: [2022-11-26 09:32:04,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 4: [2022-11-26 09:32:04,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 14: [2022-11-26 09:32:04,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 18: [2022-11-26 09:32:04,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 10: [2022-11-26 09:32:04,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 8: [2022-11-26 09:32:04,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 26: [2022-11-26 09:32:04,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 11: [2022-11-26 09:32:04,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 17: [2022-11-26 09:32:04,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 9: [2022-11-26 09:32:04,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 13: [2022-11-26 09:32:04,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 23: [2022-11-26 09:32:04,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 25: [2022-11-26 09:32:04,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 29: [2022-11-26 09:32:04,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_239_mp_rank_00_optim_states.pt 31: [2022-11-26 09:32:04,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 5: [2022-11-26 09:32:04,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 33: [2022-11-26 09:32:04,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 1: [2022-11-26 09:32:04,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 7: [2022-11-26 09:32:04,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 3: [2022-11-26 09:32:04,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 35: [2022-11-26 09:32:04,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 22: [2022-11-26 09:32:04,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_179_mp_rank_00_optim_states.pt 12: [2022-11-26 09:32:04,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 30: [2022-11-26 09:32:04,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 24: [2022-11-26 09:32:04,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 28: [2022-11-26 09:32:04,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 20: [2022-11-26 09:32:04,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_162_mp_rank_00_optim_states.pt 34: [2022-11-26 09:32:04,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 44: [2022-11-26 09:32:04,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 6: [2022-11-26 09:32:04,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 32: [2022-11-26 09:32:04,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 42: [2022-11-26 09:32:04,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 46: [2022-11-26 09:32:04,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 36: [2022-11-26 09:32:04,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 48: [2022-11-26 09:32:04,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 41: [2022-11-26 09:32:04,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 37: [2022-11-26 09:32:04,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 47: [2022-11-26 09:32:04,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 49: [2022-11-26 09:32:04,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 43: [2022-11-26 09:32:04,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 45: [2022-11-26 09:32:04,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 39: [2022-11-26 09:32:04,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 50: [2022-11-26 09:32:04,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 38: [2022-11-26 09:32:04,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 61: [2022-11-26 09:32:04,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt. 9: [2022-11-26 09:32:04,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 29: [2022-11-26 09:32:04,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 3: [2022-11-26 09:32:04,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 35: [2022-11-26 09:32:04,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 22: [2022-11-26 09:32:04,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 20: [2022-11-26 09:32:04,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 6: [2022-11-26 09:32:04,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 32: [2022-11-26 09:32:04,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 37: [2022-11-26 09:32:04,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 61: [2022-11-26 09:32:04,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_238_mp_rank_01_optim_states.pt 9: [2022-11-26 09:32:04,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 3: [2022-11-26 09:32:04,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 61: [2022-11-26 09:32:04,513] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 9: [2022-11-26 09:32:04,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 5: [2022-11-26 09:32:04,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 5: [2022-11-26 09:32:04,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 5: [2022-11-26 09:32:04,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 0: [2022-11-26 09:32:04,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 17: [2022-11-26 09:32:04,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt. 17: [2022-11-26 09:32:04,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_137_mp_rank_00_optim_states.pt 17: [2022-11-26 09:32:04,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 14: [2022-11-26 09:32:04,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 14: [2022-11-26 09:32:04,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 14: [2022-11-26 09:32:04,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 7: [2022-11-26 09:32:04,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 37: [2022-11-26 09:32:04,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 7: [2022-11-26 09:32:04,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 37: [2022-11-26 09:32:04,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 7: [2022-11-26 09:32:04,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 37: [2022-11-26 09:32:04,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 23: [2022-11-26 09:32:04,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:32:04,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_185_mp_rank_00_optim_states.pt 24: [2022-11-26 09:32:04,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:32:04,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 24: [2022-11-26 09:32:04,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_197_mp_rank_00_optim_states.pt 24: [2022-11-26 09:32:04,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 6: [2022-11-26 09:32:04,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 6: [2022-11-26 09:32:04,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 6: [2022-11-26 09:32:04,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 30: [2022-11-26 09:32:04,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt. 15: [2022-11-26 09:32:04,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 30: [2022-11-26 09:32:04,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_240_mp_rank_00_optim_states.pt 30: [2022-11-26 09:32:04,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 15: [2022-11-26 09:32:04,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 15: [2022-11-26 09:32:04,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 1: [2022-11-26 09:32:04,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 1: [2022-11-26 09:32:04,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 1: [2022-11-26 09:32:04,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 21: [2022-11-26 09:32:04,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt. 21: [2022-11-26 09:32:04,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_169_mp_rank_00_optim_states.pt 21: [2022-11-26 09:32:04,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 41: [2022-11-26 09:32:04,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 41: [2022-11-26 09:32:04,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 41: [2022-11-26 09:32:04,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 18: [2022-11-26 09:32:04,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt. 18: [2022-11-26 09:32:04,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_146_mp_rank_00_optim_states.pt 18: [2022-11-26 09:32:04,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 29: [2022-11-26 09:32:04,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt. 29: [2022-11-26 09:32:04,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_233_mp_rank_00_optim_states.pt 29: [2022-11-26 09:32:04,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 0: [2022-11-26 09:32:04,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-26 09:32:04,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 39: [2022-11-26 09:32:04,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 39: [2022-11-26 09:32:04,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-26 09:32:04,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 4: [2022-11-26 09:32:04,547] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 19: [2022-11-26 09:32:04,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt. 19: [2022-11-26 09:32:04,549] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_155_mp_rank_00_optim_states.pt 19: [2022-11-26 09:32:04,549] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 12: [2022-11-26 09:32:04,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 12: [2022-11-26 09:32:04,549] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 12: [2022-11-26 09:32:04,549] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 13: [2022-11-26 09:32:04,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 13: [2022-11-26 09:32:04,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 13: [2022-11-26 09:32:04,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 4: [2022-11-26 09:32:04,547] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 42: [2022-11-26 09:32:04,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 4: [2022-11-26 09:32:04,547] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 42: [2022-11-26 09:32:04,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 42: [2022-11-26 09:32:04,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 56: [2022-11-26 09:32:04,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt. 56: [2022-11-26 09:32:04,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_198_mp_rank_01_optim_states.pt 56: [2022-11-26 09:32:04,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 59: [2022-11-26 09:32:04,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt. 59: [2022-11-26 09:32:04,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_219_mp_rank_01_optim_states.pt 59: [2022-11-26 09:32:04,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 43: [2022-11-26 09:32:04,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 43: [2022-11-26 09:32:04,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 43: [2022-11-26 09:32:04,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 10: [2022-11-26 09:32:04,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 10: [2022-11-26 09:32:04,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 10: [2022-11-26 09:32:04,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 28: [2022-11-26 09:32:04,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt. 28: [2022-11-26 09:32:04,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_230_mp_rank_00_optim_states.pt 28: [2022-11-26 09:32:04,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 52: [2022-11-26 09:32:04,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt. 52: [2022-11-26 09:32:04,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_167_mp_rank_01_optim_states.pt 52: [2022-11-26 09:32:04,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 2: [2022-11-26 09:32:04,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 2: [2022-11-26 09:32:04,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 2: [2022-11-26 09:32:04,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 20: [2022-11-26 09:32:04,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:32:04,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_167_mp_rank_00_optim_states.pt 20: [2022-11-26 09:32:04,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 44: [2022-11-26 09:32:04,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 11: [2022-11-26 09:32:04,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 44: [2022-11-26 09:32:04,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-26 09:32:04,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 11: [2022-11-26 09:32:04,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 11: [2022-11-26 09:32:04,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 62: [2022-11-26 09:32:04,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt. 62: [2022-11-26 09:32:04,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_241_mp_rank_01_optim_states.pt 62: [2022-11-26 09:32:04,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 58: [2022-11-26 09:32:04,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt. 58: [2022-11-26 09:32:04,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_210_mp_rank_01_optim_states.pt 58: [2022-11-26 09:32:04,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 49: [2022-11-26 09:32:04,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt. 49: [2022-11-26 09:32:04,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_139_mp_rank_01_optim_states.pt 49: [2022-11-26 09:32:04,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 60: [2022-11-26 09:32:04,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt. 60: [2022-11-26 09:32:04,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_231_mp_rank_01_optim_states.pt 60: [2022-11-26 09:32:04,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 22: [2022-11-26 09:32:04,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt. 22: [2022-11-26 09:32:04,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_182_mp_rank_00_optim_states.pt 22: [2022-11-26 09:32:04,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 54: [2022-11-26 09:32:04,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt. 25: [2022-11-26 09:32:04,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt. 25: [2022-11-26 09:32:04,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_202_mp_rank_00_optim_states.pt 25: [2022-11-26 09:32:04,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 34: [2022-11-26 09:32:04,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 34: [2022-11-26 09:32:04,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 34: [2022-11-26 09:32:04,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 54: [2022-11-26 09:32:04,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_180_mp_rank_01_optim_states.pt 54: [2022-11-26 09:32:04,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 47: [2022-11-26 09:32:04,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 47: [2022-11-26 09:32:04,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 47: [2022-11-26 09:32:04,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 55: [2022-11-26 09:32:04,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt. 55: [2022-11-26 09:32:04,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_184_mp_rank_01_optim_states.pt 55: [2022-11-26 09:32:04,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 0: [2022-11-26 09:32:04,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 0: [2022-11-26 09:32:04,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 33: [2022-11-26 09:32:04,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 0: [2022-11-26 09:32:04,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 33: [2022-11-26 09:32:04,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 33: [2022-11-26 09:32:04,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 63: [2022-11-26 09:32:04,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt. 63: [2022-11-26 09:32:04,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_249_mp_rank_01_optim_states.pt 63: [2022-11-26 09:32:04,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 45: [2022-11-26 09:32:04,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 32: [2022-11-26 09:32:04,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 45: [2022-11-26 09:32:04,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 5: [2022-11-26 09:32:04,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 45: [2022-11-26 09:32:04,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 32: [2022-11-26 09:32:04,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 5: [2022-11-26 09:32:04,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 32: [2022-11-26 09:32:04,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 5: [2022-11-26 09:32:04,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 46: [2022-11-26 09:32:04,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 46: [2022-11-26 09:32:04,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 3: [2022-11-26 09:32:04,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 46: [2022-11-26 09:32:04,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 3: [2022-11-26 09:32:04,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 50: [2022-11-26 09:32:04,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt. 3: [2022-11-26 09:32:04,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 50: [2022-11-26 09:32:04,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_144_mp_rank_01_optim_states.pt 50: [2022-11-26 09:32:04,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 8: [2022-11-26 09:32:04,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 8: [2022-11-26 09:32:04,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 8: [2022-11-26 09:32:04,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 53: [2022-11-26 09:32:04,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt. 21: [2022-11-26 09:32:04,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt. 53: [2022-11-26 09:32:04,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_175_mp_rank_01_optim_states.pt 53: [2022-11-26 09:32:04,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 21: [2022-11-26 09:32:04,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_174_mp_rank_00_optim_states.pt 21: [2022-11-26 09:32:04,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 48: [2022-11-26 09:32:04,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt. 48: [2022-11-26 09:32:04,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_133_mp_rank_01_optim_states.pt 48: [2022-11-26 09:32:04,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 15: [2022-11-26 09:32:04,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 35: [2022-11-26 09:32:04,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 15: [2022-11-26 09:32:04,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 15: [2022-11-26 09:32:04,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 35: [2022-11-26 09:32:04,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 35: [2022-11-26 09:32:04,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 23: [2022-11-26 09:32:04,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt. 23: [2022-11-26 09:32:04,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_189_mp_rank_00_optim_states.pt 23: [2022-11-26 09:32:04,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 37: [2022-11-26 09:32:04,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 38: [2022-11-26 09:32:04,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 37: [2022-11-26 09:32:04,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 37: [2022-11-26 09:32:04,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 10: [2022-11-26 09:32:04,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 10: [2022-11-26 09:32:04,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 10: [2022-11-26 09:32:04,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 36: [2022-11-26 09:32:04,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 36: [2022-11-26 09:32:04,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 36: [2022-11-26 09:32:04,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 38: [2022-11-26 09:32:04,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 38: [2022-11-26 09:32:04,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 26: [2022-11-26 09:32:04,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt. 26: [2022-11-26 09:32:04,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_210_mp_rank_00_optim_states.pt 26: [2022-11-26 09:32:04,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 42: [2022-11-26 09:32:04,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 42: [2022-11-26 09:32:04,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 42: [2022-11-26 09:32:04,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 57: [2022-11-26 09:32:04,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt. 9: [2022-11-26 09:32:04,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 8: [2022-11-26 09:32:04,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 57: [2022-11-26 09:32:04,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_204_mp_rank_01_optim_states.pt 8: [2022-11-26 09:32:04,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 9: [2022-11-26 09:32:04,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 8: [2022-11-26 09:32:04,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 9: [2022-11-26 09:32:04,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 57: [2022-11-26 09:32:04,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 61: [2022-11-26 09:32:04,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt. 61: [2022-11-26 09:32:04,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_233_mp_rank_01_optim_states.pt 61: [2022-11-26 09:32:04,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 24: [2022-11-26 09:32:04,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt. 24: [2022-11-26 09:32:04,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_195_mp_rank_00_optim_states.pt 24: [2022-11-26 09:32:04,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 28: [2022-11-26 09:32:04,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt. 28: [2022-11-26 09:32:04,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_225_mp_rank_00_optim_states.pt 28: [2022-11-26 09:32:04,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 17: [2022-11-26 09:32:04,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt. 17: [2022-11-26 09:32:04,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_141_mp_rank_00_optim_states.pt 17: [2022-11-26 09:32:04,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 51: [2022-11-26 09:32:04,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt. 51: [2022-11-26 09:32:04,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt. 51: [2022-11-26 09:32:04,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_155_mp_rank_01_optim_states.pt 51: [2022-11-26 09:32:04,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_159_mp_rank_01_optim_states.pt 51: [2022-11-26 09:32:04,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 44: [2022-11-26 09:32:04,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 44: [2022-11-26 09:32:04,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 51: [2022-11-26 09:32:04,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 44: [2022-11-26 09:32:04,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 34: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 34: [2022-11-26 09:32:04,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 48: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt. 30: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt. 34: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 48: [2022-11-26 09:32:04,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_135_mp_rank_01_optim_states.pt 2: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 48: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 41: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 30: [2022-11-26 09:32:04,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_245_mp_rank_00_optim_states.pt 2: [2022-11-26 09:32:04,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 41: [2022-11-26 09:32:04,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 56: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt. 2: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 35: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 30: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 41: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 56: [2022-11-26 09:32:04,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_199_mp_rank_01_optim_states.pt 35: [2022-11-26 09:32:04,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 56: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 16: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt. 35: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 16: [2022-11-26 09:32:04,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_129_mp_rank_00_optim_states.pt 16: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 22: [2022-11-26 09:32:04,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt. 22: [2022-11-26 09:32:04,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_177_mp_rank_00_optim_states.pt 22: [2022-11-26 09:32:04,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 16: [2022-11-26 09:32:04,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt. 16: [2022-11-26 09:32:04,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_130_mp_rank_00_optim_states.pt 31: [2022-11-26 09:32:04,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt. 16: [2022-11-26 09:32:04,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 31: [2022-11-26 09:32:04,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_250_mp_rank_00_optim_states.pt 31: [2022-11-26 09:32:04,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 55: [2022-11-26 09:32:04,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt. 25: [2022-11-26 09:32:04,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt. 62: [2022-11-26 09:32:04,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt. 25: [2022-11-26 09:32:04,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_204_mp_rank_00_optim_states.pt 55: [2022-11-26 09:32:04,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_189_mp_rank_01_optim_states.pt 55: [2022-11-26 09:32:04,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 62: [2022-11-26 09:32:04,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_246_mp_rank_01_optim_states.pt 25: [2022-11-26 09:32:04,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 62: [2022-11-26 09:32:04,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 19: [2022-11-26 09:32:04,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt. 7: [2022-11-26 09:32:04,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 19: [2022-11-26 09:32:04,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_157_mp_rank_00_optim_states.pt 7: [2022-11-26 09:32:04,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 19: [2022-11-26 09:32:04,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 7: [2022-11-26 09:32:04,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 57: [2022-11-26 09:32:04,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt. 57: [2022-11-26 09:32:04,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_205_mp_rank_01_optim_states.pt 57: [2022-11-26 09:32:04,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 14: [2022-11-26 09:32:04,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 14: [2022-11-26 09:32:04,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 14: [2022-11-26 09:32:04,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 39: [2022-11-26 09:32:04,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 39: [2022-11-26 09:32:04,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 39: [2022-11-26 09:32:04,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 58: [2022-11-26 09:32:04,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt. 58: [2022-11-26 09:32:04,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_212_mp_rank_01_optim_states.pt 58: [2022-11-26 09:32:04,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 6: [2022-11-26 09:32:04,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 6: [2022-11-26 09:32:04,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 6: [2022-11-26 09:32:04,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 20: [2022-11-26 09:32:04,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt. 20: [2022-11-26 09:32:04,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_166_mp_rank_00_optim_states.pt 20: [2022-11-26 09:32:04,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 54: [2022-11-26 09:32:04,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt. 54: [2022-11-26 09:32:04,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_181_mp_rank_01_optim_states.pt 54: [2022-11-26 09:32:04,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 52: [2022-11-26 09:32:04,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt. 18: [2022-11-26 09:32:04,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt. 52: [2022-11-26 09:32:04,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_164_mp_rank_01_optim_states.pt 52: [2022-11-26 09:32:04,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 18: [2022-11-26 09:32:04,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_144_mp_rank_00_optim_states.pt 18: [2022-11-26 09:32:04,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 13: [2022-11-26 09:32:04,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 60: [2022-11-26 09:32:04,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt. 13: [2022-11-26 09:32:04,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 60: [2022-11-26 09:32:04,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_226_mp_rank_01_optim_states.pt 13: [2022-11-26 09:32:04,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 49: [2022-11-26 09:32:04,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt. 60: [2022-11-26 09:32:04,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 49: [2022-11-26 09:32:04,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_141_mp_rank_01_optim_states.pt 49: [2022-11-26 09:32:04,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 38: [2022-11-26 09:32:04,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 38: [2022-11-26 09:32:04,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 38: [2022-11-26 09:32:04,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 32: [2022-11-26 09:32:04,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 26: [2022-11-26 09:32:04,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt. 12: [2022-11-26 09:32:04,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 26: [2022-11-26 09:32:04,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_214_mp_rank_00_optim_states.pt 12: [2022-11-26 09:32:04,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 32: [2022-11-26 09:32:04,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 26: [2022-11-26 09:32:04,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 12: [2022-11-26 09:32:04,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 32: [2022-11-26 09:32:04,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 4: [2022-11-26 09:32:04,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 4: [2022-11-26 09:32:04,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 53: [2022-11-26 09:32:04,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt. 4: [2022-11-26 09:32:04,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 11: [2022-11-26 09:32:04,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 53: [2022-11-26 09:32:04,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_169_mp_rank_01_optim_states.pt 53: [2022-11-26 09:32:04,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 11: [2022-11-26 09:32:04,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 11: [2022-11-26 09:32:04,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 59: [2022-11-26 09:32:04,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt. 59: [2022-11-26 09:32:04,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_216_mp_rank_01_optim_states.pt 59: [2022-11-26 09:32:04,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 33: [2022-11-26 09:32:04,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 33: [2022-11-26 09:32:04,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 33: [2022-11-26 09:32:04,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 29: [2022-11-26 09:32:04,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt. 29: [2022-11-26 09:32:04,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_236_mp_rank_00_optim_states.pt 29: [2022-11-26 09:32:04,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 47: [2022-11-26 09:32:04,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 1: [2022-11-26 09:32:04,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 45: [2022-11-26 09:32:04,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 47: [2022-11-26 09:32:04,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 1: [2022-11-26 09:32:04,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 47: [2022-11-26 09:32:04,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 45: [2022-11-26 09:32:04,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 1: [2022-11-26 09:32:04,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 45: [2022-11-26 09:32:04,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 46: [2022-11-26 09:32:04,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 63: [2022-11-26 09:32:04,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt. 63: [2022-11-26 09:32:04,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_248_mp_rank_01_optim_states.pt 46: [2022-11-26 09:32:04,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 46: [2022-11-26 09:32:04,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 63: [2022-11-26 09:32:04,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 50: [2022-11-26 09:32:04,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt. 50: [2022-11-26 09:32:04,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_151_mp_rank_01_optim_states.pt 50: [2022-11-26 09:32:04,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 43: [2022-11-26 09:32:04,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 43: [2022-11-26 09:32:04,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 43: [2022-11-26 09:32:04,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 36: [2022-11-26 09:32:04,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 36: [2022-11-26 09:32:04,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 36: [2022-11-26 09:32:04,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 31: [2022-11-26 09:32:04,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt. 31: [2022-11-26 09:32:04,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_253_mp_rank_00_optim_states.pt 31: [2022-11-26 09:32:04,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 16: [2022-11-26 09:32:04,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt. 16: [2022-11-26 09:32:04,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_133_mp_rank_00_optim_states.pt 16: [2022-11-26 09:32:04,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 36: [2022-11-26 09:32:04,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 36: [2022-11-26 09:32:04,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 36: [2022-11-26 09:32:04,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 47: [2022-11-26 09:32:04,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 47: [2022-11-26 09:32:04,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 47: [2022-11-26 09:32:04,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 27: [2022-11-26 09:32:04,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt. 27: [2022-11-26 09:32:04,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt. 27: [2022-11-26 09:32:04,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt. 27: [2022-11-26 09:32:04,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt. 27: [2022-11-26 09:32:04,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_217_mp_rank_00_optim_states.pt 27: [2022-11-26 09:32:04,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_219_mp_rank_00_optim_states.pt 27: [2022-11-26 09:32:04,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_222_mp_rank_00_optim_states.pt 27: [2022-11-26 09:32:04,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_221_mp_rank_00_optim_states.pt 27: [2022-11-26 09:32:04,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 27: [2022-11-26 09:32:04,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 27: [2022-11-26 09:32:04,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 27: [2022-11-26 09:32:04,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 16: [2022-11-26 09:32:04,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt. 16: [2022-11-26 09:32:04,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_135_mp_rank_00_optim_states.pt 16: [2022-11-26 09:32:04,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 27: [2022-11-26 09:32:04,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt. 27: [2022-11-26 09:32:04,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_218_mp_rank_00_optim_states.pt 27: [2022-11-26 09:32:04,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 27: [2022-11-26 09:32:04,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt. 27: [2022-11-26 09:32:04,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_220_mp_rank_00_optim_states.pt 27: [2022-11-26 09:32:04,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 27: [2022-11-26 09:32:04,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt. 27: [2022-11-26 09:32:04,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_216_mp_rank_00_optim_states.pt 27: [2022-11-26 09:32:04,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 27: [2022-11-26 09:32:04,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt. 27: [2022-11-26 09:32:04,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_223_mp_rank_00_optim_states.pt 27: [2022-11-26 09:32:04,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 16: [2022-11-26 09:32:04,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt. 16: [2022-11-26 09:32:04,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_131_mp_rank_00_optim_states.pt 16: [2022-11-26 09:32:04,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 16: [2022-11-26 09:32:04,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt. 16: [2022-11-26 09:32:04,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_132_mp_rank_00_optim_states.pt 16: [2022-11-26 09:32:04,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 16: [2022-11-26 09:32:04,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt. 16: [2022-11-26 09:32:04,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt. 16: [2022-11-26 09:32:04,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_134_mp_rank_00_optim_states.pt 16: [2022-11-26 09:32:04,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_128_mp_rank_00_optim_states.pt 16: [2022-11-26 09:32:04,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 16: [2022-11-26 09:32:04,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 40: [2022-11-26 09:32:04,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 40: [2022-11-26 09:32:04,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 40: [2022-11-26 09:32:04,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 40: [2022-11-26 09:32:04,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 40: [2022-11-26 09:32:04,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 40: [2022-11-26 09:32:04,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 40: [2022-11-26 09:32:04,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 40: [2022-11-26 09:32:04,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 40: [2022-11-26 09:32:04,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 40: [2022-11-26 09:32:04,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 40: [2022-11-26 09:32:04,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 40: [2022-11-26 09:32:04,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 40: [2022-11-26 09:32:04,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 40: [2022-11-26 09:32:04,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 40: [2022-11-26 09:32:04,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 40: [2022-11-26 09:32:04,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 40: [2022-11-26 09:32:04,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 40: [2022-11-26 09:32:04,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 40: [2022-11-26 09:32:04,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 40: [2022-11-26 09:32:04,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 40: [2022-11-26 09:32:04,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 40: [2022-11-26 09:32:04,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 40: [2022-11-26 09:32:04,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_3b9/global_step24424/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 40: [2022-11-26 09:32:04,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24424 is ready now! 0: successfully saved checkpoint at iteration 24424 to checkpoints_3b9 63: ------------------------------------------------------------------------------------------------------------ 63: test loss at the end of training for test data | lm loss value: 1.985722E+00 | lm loss PPL: 7.284306E+00 | 63: ------------------------------------------------------------------------------------------------------------ 33: END 2068237: Sat Nov 26 09:32:24 EET 2022 47: END 2068237: Sat Nov 26 09:32:24 EET 2022 61: END 2068237: Sat Nov 26 09:32:24 EET 2022 31: END 2068237: Sat Nov 26 09:32:24 EET 2022 22: END 2068237: Sat Nov 26 09:32:24 EET 2022 36: END 2068237: Sat Nov 26 09:32:24 EET 2022 32: END 2068237: Sat Nov 26 09:32:24 EET 2022 12: END 2068237: Sat Nov 26 09:32:24 EET 2022 55: END 2068237: Sat Nov 26 09:32:24 EET 2022 14: END 2068237: Sat Nov 26 09:32:24 EET 2022 0: END 2068237: Sat Nov 26 09:32:24 EET 2022 13: END 2068237: Sat Nov 26 09:32:24 EET 2022 27: END 2068237: Sat Nov 26 09:32:24 EET 2022 19: END 2068237: Sat Nov 26 09:32:24 EET 2022 21: END 2068237: Sat Nov 26 09:32:24 EET 2022 46: END 2068237: Sat Nov 26 09:32:24 EET 2022 24: END 2068237: Sat Nov 26 09:32:24 EET 2022 52: END 2068237: Sat Nov 26 09:32:24 EET 2022 60: END 2068237: Sat Nov 26 09:32:24 EET 2022 48: END 2068237: Sat Nov 26 09:32:24 EET 2022 10: END 2068237: Sat Nov 26 09:32:24 EET 2022 41: END 2068237: Sat Nov 26 09:32:24 EET 2022 17: END 2068237: Sat Nov 26 09:32:24 EET 2022 53: END 2068237: Sat Nov 26 09:32:24 EET 2022 25: END 2068237: Sat Nov 26 09:32:24 EET 2022 63: END 2068237: Sat Nov 26 09:32:24 EET 2022 5: END 2068237: Sat Nov 26 09:32:24 EET 2022 59: END 2068237: Sat Nov 26 09:32:24 EET 2022 62: END 2068237: Sat Nov 26 09:32:24 EET 2022 42: END 2068237: Sat Nov 26 09:32:24 EET 2022 1: END 2068237: Sat Nov 26 09:32:24 EET 2022 20: END 2068237: Sat Nov 26 09:32:24 EET 2022 58: END 2068237: Sat Nov 26 09:32:24 EET 2022 34: END 2068237: Sat Nov 26 09:32:24 EET 2022 16: END 2068237: Sat Nov 26 09:32:24 EET 2022 23: END 2068237: Sat Nov 26 09:32:24 EET 2022 29: END 2068237: Sat Nov 26 09:32:24 EET 2022 6: END 2068237: Sat Nov 26 09:32:24 EET 2022 37: END 2068237: Sat Nov 26 09:32:24 EET 2022 56: END 2068237: Sat Nov 26 09:32:24 EET 2022 8: END 2068237: Sat Nov 26 09:32:24 EET 2022 26: END 2068237: Sat Nov 26 09:32:24 EET 2022 57: END 2068237: Sat Nov 26 09:32:24 EET 2022 18: END 2068237: Sat Nov 26 09:32:24 EET 2022 11: END 2068237: Sat Nov 26 09:32:24 EET 2022 7: END 2068237: Sat Nov 26 09:32:24 EET 2022 28: END 2068237: Sat Nov 26 09:32:24 EET 2022 40: END 2068237: Sat Nov 26 09:32:24 EET 2022 44: END 2068237: Sat Nov 26 09:32:24 EET 2022 45: END 2068237: Sat Nov 26 09:32:24 EET 2022 39: END 2068237: Sat Nov 26 09:32:24 EET 2022 38: END 2068237: Sat Nov 26 09:32:24 EET 2022 9: END 2068237: Sat Nov 26 09:32:24 EET 2022 3: END 2068237: Sat Nov 26 09:32:24 EET 2022 30: END 2068237: Sat Nov 26 09:32:24 EET 2022 15: END 2068237: Sat Nov 26 09:32:24 EET 2022 4: END 2068237: Sat Nov 26 09:32:24 EET 2022 35: END 2068237: Sat Nov 26 09:32:24 EET 2022 43: END 2068237: Sat Nov 26 09:32:24 EET 2022 50: END 2068237: Sat Nov 26 09:32:24 EET 2022 2: END 2068237: Sat Nov 26 09:32:24 EET 2022 54: END 2068237: Sat Nov 26 09:32:24 EET 2022 49: END 2068237: Sat Nov 26 09:32:24 EET 2022 51: END 2068237: Sat Nov 26 09:32:24 EET 2022